ofermend committed on
Commit
d0c57df
·
1 Parent(s): 8bb0636

updated to gradio; python 3.11; visual improvements

Browse files
Dockerfile CHANGED
@@ -1,8 +1,7 @@
1
- FROM python:3.10
2
 
3
  WORKDIR /app
4
 
5
- COPY ./app/vectara_theme.py /app/vectara_theme.py
6
  COPY ./app/requirements.txt /app/requirements.txt
7
  COPY ./app/app.py /app/app.py
8
  COPY ./app/app_utils.py /app/app_utils.py
@@ -18,4 +17,4 @@ ENV HOME=/home/user \
18
  RUN mkdir -p /app/results
19
  RUN chown -R user /app
20
 
21
- CMD ["funix", "app.py", "--host", "0.0.0.0", "--port", "7860", "--no-browser"]
 
1
+ FROM python:3.11
2
 
3
  WORKDIR /app
4
 
 
5
  COPY ./app/requirements.txt /app/requirements.txt
6
  COPY ./app/app.py /app/app.py
7
  COPY ./app/app_utils.py /app/app_utils.py
 
17
  RUN mkdir -p /app/results
18
  RUN chown -R user /app
19
 
20
+ CMD ["python", "app.py"]
app/app.py CHANGED
@@ -1,62 +1,128 @@
1
- from typing import Callable, Literal, List, Tuple
2
- import json
3
-
4
  import pandas as pd
5
- import matplotlib.figure
6
- from IPython.display import Markdown
7
-
8
- from funix import funix, import_theme
9
- from vectara_theme import vectara_theme
10
- import_theme(vectara_theme)
11
 
12
  from app_utils import load_results, visualize_leaderboard
13
 
14
  results_df = load_results()
15
 
16
- @funix(
17
- title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
18
- direction="column",
19
- autorun="always",
20
- theme="vectara",
21
- matplotlib_format="svg",
22
- # output_layout=[
23
- # [{"return_index": 0, "width": 0.3}],
24
- # [{"return_index": 1, "width": 0.7}],
25
- # ]
26
- )
27
- def leaderboard(
28
- filter_models_by_name: str = ""
29
- # filter_models_by_name: List[Literal["all", "anthropic", "google", "meta", "openai", "xai", "qwen"]] = ["all"]
30
- ) -> Tuple[Markdown, matplotlib.figure.Figure, pd.DataFrame]:
31
- # ) -> Tuple[Markdown, pd.DataFrame]:
32
- """# Hughes Hallucination Evaluation Model (HHEM) Leaderboard
33
 
34
- Using [Vectara](https://vectara.com/)'s proprietary [HHEM](https://www.vectara.com/blog/hhem-2-1-a-better-hallucination-detection-model), this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document. For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates. HHEM's open source version is available [here](https://huggingface.co/vectara/hallucination_evaluation_model). For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
35
 
36
- **Usage:**
 
 
 
 
 
 
 
37
 
38
- * All LLMs are displayed by default. To filter, enter the names of the models that you want to see in the "Filter Models by Name" field below, separated by commas or semicolons.
39
- * Results are paginated. To page thru, use the `<` or `>` buttons at the bottom right corner of the table.
40
- * To sort the table, hover over a column header and click the arrow. The arrow automatically points up and down depending on the sort order.
41
- * Click the "Refresh" button to refresh the leaderboard if the table is not shown or does not update when you change the filter.
42
 
43
- Args:
44
- filter_models_by_name: filter models by name using comma-separated strings
45
- """
46
- df = results_df
 
 
 
47
 
 
48
  filter_models_by_name = filter_models_by_name.replace(",", ";").replace(" ", "")
49
- if len(filter_models_by_name) > 0 and "all" not in filter_models_by_name:
50
- filter_models_by_name = filter_models_by_name.split(";")
51
- # filter_models_by_name = [name for name in filter_models_by_name if name != "all"]
52
- filter_models_by_name = [name for name in filter_models_by_name if name != ""]
53
- df = df.copy()
54
- df = df[df["LLM_lower_case"].str.contains("|".join(filter_models_by_name), na=False)]
55
-
56
- if len(df) == 0: # return an empty DF and an empty figure
57
- return Markdown(f"No models found matching: {filter_models_by_name}"), matplotlib.figure.Figure(), pd.DataFrame()
58
-
59
- # return Markdown(""), df
 
 
60
 
61
  fig = visualize_leaderboard(df)
62
- return Markdown(""), fig, df[["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
 
 
2
  import pandas as pd
3
+ import matplotlib.pyplot
 
 
 
 
 
4
 
5
  from app_utils import load_results, visualize_leaderboard
6
 
7
  results_df = load_results()
8
 
9
+ DESCRIPTION = """
10
+ # Hughes Hallucination Evaluation Model (HHEM) Leaderboard
11
+
12
+ Using [Vectara](https://vectara.com/)'s proprietary [Factual Consistency Evaluation Model](https://www.vectara.com/blog/hallucination-detection-commercial-vs-open-source-a-deep-dive),
13
+ this leaderboard evaluates how often an LLM hallucinates -- containing information not stated in the source document -- when summarizing a document.
14
+ For an LLM, its hallucination rate is defined as the ratio of summaries that hallucinate to the total number of summaries it generates.
15
+ For more details or to contribute, see [this Github repo](https://github.com/vectara/hallucination-leaderboard).
16
+ """
 
 
 
 
 
 
 
 
 
17
 
 
18
 
19
def leaderboard(
    filter_models_by_name: str = "",
    high_ar_only: bool = False,
    size_filter: str = "all",
    access_filter: str = "all"
):
    """Filter the leaderboard and build the plot and table shown in the UI.

    Args:
        filter_models_by_name: Comma- or semicolon-separated name fragments
            (spaces are ignored). A row is kept when its ``LLM_lower_case``
            value contains any fragment. An empty value, or a list that
            contains the keyword ``all``, disables name filtering.
        high_ar_only: When true, keep only rows whose ``Answer %`` is >= 95.
        size_filter: ``"all"`` or a value to match exactly against the
            ``Model Size`` column.
        access_filter: ``"all"`` or a value to match exactly against the
            ``Accessibility`` column.

    Returns:
        A ``(figure, dataframe)`` pair. The dataframe has the columns
        ``LLM``, ``Hallucination %``, ``Answer %`` and ``Avg Summary Words``;
        when nothing matches it is empty and the figure shows a message.
    """
    import re  # local import: only needed to escape user-supplied fragments

    table_columns = ["LLM", "Hallucination %", "Answer %", "Avg Summary Words"]
    df = results_df.copy()

    # Filter by answer rate if toggle is on
    if high_ar_only:
        df = df[df["Answer %"] >= 95]

    # Filter by model size
    if size_filter and size_filter != "all":
        df = df[df["Model Size"] == size_filter]

    # Filter by accessibility
    if access_filter and access_filter != "all":
        df = df[df["Accessibility"] == access_filter]

    # Filter by model name
    raw_filter = (filter_models_by_name or "").replace(",", ";").replace(" ", "")
    filter_list = [name.lower() for name in raw_filter.split(";") if name]
    # Compare whole fragments against "all": a substring test on the raw text
    # would treat e.g. "small" as "all" and silently skip filtering.
    if filter_list and "all" not in filter_list:
        # Escape each fragment so characters such as "." or "(" are matched
        # literally instead of being interpreted as regex syntax.
        pattern = "|".join(re.escape(name) for name in filter_list)
        df = df[df["LLM_lower_case"].str.contains(pattern, regex=True, na=False)]

    if df.empty:
        # Show "no results" message in the plot. The Figure is built directly
        # (not via pyplot.subplots) so it is not kept alive in pyplot's global
        # figure registry after the request finishes.
        fig = matplotlib.pyplot.Figure(figsize=(10, 5))
        ax = fig.subplots()
        ax.text(0.5, 0.5, "No models found matching your filter",
                ha='center', va='center', fontsize=14, color='gray')
        ax.set_xlim(0, 1)
        ax.set_ylim(0, 1)
        ax.axis('off')
        return fig, pd.DataFrame(columns=table_columns)

    fig = visualize_leaderboard(df)
    return fig, df[table_columns]
58
+
59
+
60
# Page-level CSS: lays out the header logo and hides Gradio's default footer.
_PAGE_CSS = """
.header-logo {
    display: flex;
    align-items: center;
    gap: 10px;
    margin-bottom: 10px;
}
.header-logo img {
    height: 40px;
}
footer { display: none !important; }
"""

# Header markup containing the Vectara logo.
_HEADER_HTML = (
    '<div class="header-logo">'
    '<img src="https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png" alt="Vectara">'
    '</div>'
)

with gr.Blocks(
    title="Hughes Hallucination Evaluation Model (HHEM) Leaderboard",
    theme=gr.themes.Soft(),
    css=_PAGE_CSS,
) as demo:
    gr.HTML(_HEADER_HTML)
    gr.Markdown(DESCRIPTION)

    # Top row: plot on the left (3/4 width), filter controls on the right.
    with gr.Row():
        with gr.Column(scale=3):
            plot_output = gr.Plot(show_label=False)
        with gr.Column(scale=1):
            filter_input = gr.Textbox(
                placeholder="Filter models...",
                show_label=False,
                value="",
            )
            high_ar_toggle = gr.Checkbox(
                label="Only models with ≥95% answer rate",
                value=False,
            )
            size_filter = gr.Radio(
                choices=["all", "small", "large"],
                value="all",
                label="Model size",
            )
            access_filter = gr.Radio(
                choices=["all", "commercial", "open"],
                value="all",
                label="Model type",
            )

    # Bottom row: read-only results table.
    with gr.Row():
        table_output = gr.Dataframe(
            label="Leaderboard",
            interactive=False,
            max_height=500,
        )

    inputs = [filter_input, high_ar_toggle, size_filter, access_filter]
    outputs = [plot_output, table_output]

    # Populate the plot and table when the page first loads.
    demo.load(fn=leaderboard, inputs=inputs, outputs=outputs)

    # Recompute whenever any filter control changes; every control passes the
    # full set of inputs so the handler always sees the complete filter state.
    for control in inputs:
        control.change(fn=leaderboard, inputs=inputs, outputs=outputs)


if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)
app/app_utils.py CHANGED
@@ -1,22 +1,23 @@
1
  # %%
2
- import os
3
  import json
4
- from huggingface_hub import Repository
5
  import pandas as pd
6
  import matplotlib.pyplot as plt
7
  import matplotlib.figure
8
  from datetime import datetime
9
  from sklearn.preprocessing import MinMaxScaler
10
-
11
- # import dotenv
12
- # dotenv.load_dotenv()
13
 
14
  min_max_scaler = MinMaxScaler()
15
 
16
  # %%
17
  def pull_results(results_dir: str):
18
- repo = Repository(local_dir = results_dir, clone_from="vectara/results", repo_type="dataset")
19
- repo.git_pull()
 
 
 
20
 
21
  def extract_info_from_result_file(result_file):
22
  """
@@ -43,12 +44,19 @@ def extract_info_from_result_file(result_file):
43
  """
44
 
45
  info = json.load(open(result_file, 'r'))
 
 
 
 
 
 
46
  result = {
47
- "LLM": info["config"]["model_name"],
48
  "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
49
- # "Factual Consistency Rate": info["results"]["factual_consistency_rate"]["factual_consistency_rate"],
50
  "Answer %": info["results"]["answer_rate"]["answer_rate"],
51
  "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
 
 
52
  }
53
  return result
54
 
@@ -63,8 +71,8 @@ def get_latest_result_file(dir: str):
63
  if len(files) == 0:
64
  return None
65
  files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
66
- # print ("Scanning: ", dir, "found latest file: ", files[0])
67
- return os.path.join(dir, files[0])
68
 
69
  def scan_and_extract(dir: str):
70
  """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
@@ -80,43 +88,26 @@ def scan_and_extract(dir: str):
80
  results.append(extract_info_from_result_file(result_file))
81
  return results
82
 
83
- def load_results(
84
- results_dir: str = "./results",
85
- results_json: str = "./results.json"
86
- ):
87
-
88
- try:
89
- pull_results(results_dir)
90
- print (f"Successfully pulled results from {results_dir}")
91
- except Exception as e:
92
- print(f"Failed to pull and/or extract latest results: {e}")
93
-
94
- try:
95
- results = scan_and_extract(results_dir)
96
- if len(results) > 0:
97
- with open(results_json, "w") as f:
98
- json.dump(results, f, indent=2)
99
- print(f"Successfully scanned and extracted results from {results_dir} and saved to {results_json}")
100
- else:
101
- print(f"No results found in {results_dir}")
102
- except Exception as e:
103
- print(f"Failed to scan and extract results from {results_dir}: {e}")
104
- print(f"Using pre-dumped results from {results_json}")
105
 
106
- results = json.load(open(results_json, "r"))
107
- # print(results)
108
 
109
  results_df = pd.DataFrame(results)
110
  results_df = results_df.sort_values(by="Hallucination %", ascending=True)
111
-
112
- # replace any value TBD with -1
113
  results_df = results_df.replace("TBD", 100)
114
 
115
  for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
116
  results_df[column] = results_df[column].apply(lambda x: round(x, 3))
117
 
118
  results_df["LLM_lower_case"] = results_df["LLM"].str.lower()
119
-
120
  return results_df
121
 
122
  # %%
@@ -150,53 +141,51 @@ def determine_llm_x_position_and_font_color(LLM: str, hallucination_percent: flo
150
  return hallucination_percent, 'black'
151
 
152
  def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
153
- fig = plt.figure(figsize=(8, 4))
154
- # plot using LLM as x-axis and Hallucination % as y-axis
155
- # make bars horizontal
156
- plot_df = df.head(10)
157
  plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])
158
 
159
- plt.barh(plot_df["LLM"], plot_df["Hallucination %"], color=plt.cm.jet(plot_df["normalized_hallucination_rate"]))
 
 
160
 
161
- # plot_df["LLM_x_position"], plot_df["font_color"] = zip(*plot_df.apply(
162
- # lambda row: determine_llm_x_position_and_font_color(row["LLM"], row["Hallucination %"]),
163
- # axis=1
164
- # ))
165
-
166
- for i, row in plot_df.iterrows():
167
- plt.text(
168
- # row["LLM_x_position"],
169
- row["Hallucination %"] + 0.025,
170
- row["LLM"],
171
- row["Hallucination %"],
172
- # f"{row['LLM']}",
173
- ha='left',
174
- va='center',
175
- fontsize=9,
176
- # color=row["font_color"]
177
- )
178
- # plt.yticks([])
179
- plt.tight_layout()
180
 
181
- # add margin to the right of the plot
182
- plt.subplots_adjust(right=0.95)
 
 
 
 
183
 
184
- plt.xticks(fontsize=9)
185
- plt.xlabel(f"Copyright (2025) Vectara, Inc. Plot generated on: {datetime.now().strftime('%B %d, %Y')}", fontsize=9)
 
 
186
  plt.title("Grounded Hallucination Rate of Best LLMs", fontsize=12)
 
187
  plt.gca().spines['top'].set_visible(False)
188
  plt.gca().spines['right'].set_visible(False)
189
- plt.gca().spines['left'].set_visible(False)
190
- plt.gca().invert_yaxis() # Invert the y-axis to display bars top-down
 
 
 
 
 
 
 
 
 
 
191
 
192
  return fig
193
 
194
  # %%
195
 
196
  if __name__ == "__main__":
197
- results = scan_and_extract("./results")
198
- with open("./results.json", "w") as f:
199
- json.dump(results, f, indent=2)
200
 
201
  # %%
202
 
 
1
  # %%
2
+ import os
3
  import json
4
+ from huggingface_hub import snapshot_download
5
  import pandas as pd
6
  import matplotlib.pyplot as plt
7
  import matplotlib.figure
8
  from datetime import datetime
9
  from sklearn.preprocessing import MinMaxScaler
10
+ import matplotlib.patheffects as pe
 
 
11
 
12
  min_max_scaler = MinMaxScaler()
13
 
14
  # %%
def pull_results(results_dir: str):
    """Download a snapshot of the ``vectara/results`` dataset repo into *results_dir*."""
    download_options = {
        "repo_id": "vectara/results",
        "repo_type": "dataset",
        "local_dir": results_dir,
    }
    snapshot_download(**download_options)
21
 
22
  def extract_info_from_result_file(result_file):
23
  """
 
44
  """
45
 
46
  info = json.load(open(result_file, 'r'))
47
+
48
+ # Extract model_annotations with defaults for missing data
49
+ annotations = info.get("model_annotations", {})
50
+ model_size = annotations.get("model_size", "unknown")
51
+ accessibility = annotations.get("accessibility", "unknown")
52
+
53
  result = {
54
+ "LLM": info["config"]["model_name"].rstrip("-"),
55
  "Hallucination %": info["results"]["hallucination_rate"]["hallucination_rate"],
 
56
  "Answer %": info["results"]["answer_rate"]["answer_rate"],
57
  "Avg Summary Words": info["results"]["average_summary_length"]["average_summary_length"],
58
+ "Model Size": model_size,
59
+ "Accessibility": accessibility,
60
  }
61
  return result
62
 
 
71
  if len(files) == 0:
72
  return None
73
  files.sort(key=lambda x: os.path.getmtime(os.path.join(dir, x)))
74
+ # Return the last file (most recent by mtime)
75
+ return os.path.join(dir, files[-1])
76
 
77
  def scan_and_extract(dir: str):
78
  """Scan all folders recursively and exhaustively to load all JSON files and call `extract_info_from_result_file` on each one.
 
88
  results.append(extract_info_from_result_file(result_file))
89
  return results
90
 
91
def load_results(results_dir: str = "/tmp/hhem_results"):
    """Download the results dataset and return it as a sorted DataFrame.

    The dataset snapshot is pulled into *results_dir*, every result file found
    there is parsed, and the rows are assembled into a DataFrame.

    Args:
        results_dir: Local directory the dataset snapshot is downloaded to.

    Returns:
        A DataFrame sorted by ascending ``Hallucination %`` with the numeric
        columns rounded to 3 decimals and an extra ``LLM_lower_case`` column
        holding the lower-cased model name.

    Raises:
        ValueError: If no results could be extracted from *results_dir*.
    """
    pull_results(results_dir)
    print(f"Successfully pulled results from HuggingFace to {results_dir}")

    results = scan_and_extract(results_dir)
    if not results:
        raise ValueError(f"No results found in {results_dir}")

    print(f"Successfully extracted {len(results)} results")

    results_df = pd.DataFrame(results)

    # Replace "TBD" placeholders BEFORE sorting: a column mixing the string
    # "TBD" with floats cannot be ordered, and the substituted value (100)
    # must take part in the ordering.
    results_df = results_df.replace("TBD", 100)

    for column in ["Hallucination %", "Answer %", "Avg Summary Words"]:
        # Coerce to a numeric dtype so sorting and rounding behave consistently
        # even if the column was loaded as generic objects.
        results_df[column] = pd.to_numeric(results_df[column]).round(3)

    results_df = results_df.sort_values(by="Hallucination %", ascending=True)

    results_df["LLM_lower_case"] = results_df["LLM"].str.lower()

    return results_df
112
 
113
  # %%
 
141
  return hallucination_percent, 'black'
142
 
def visualize_leaderboard(df: pd.DataFrame) -> matplotlib.figure.Figure:
    """Draw a horizontal bar chart of the first 10 rows of *df*.

    Args:
        df: Leaderboard rows with at least the columns ``LLM``,
            ``Hallucination %`` and ``Answer %``. Only ``df.head(10)`` is
            plotted; the input frame is not modified.

    Returns:
        The figure containing the chart. Each bar is labelled with its
        hallucination rate and ends in a dot that is green when the row's
        ``Answer %`` is >= 95 and red otherwise.
    """
    # Build the Figure directly instead of via pyplot: pyplot keeps every
    # figure it creates in a global registry until it is explicitly closed,
    # which leaks memory when this runs once per request in a server.
    fig = matplotlib.figure.Figure(figsize=(10, 5))
    ax = fig.subplots()

    plot_df = df.head(10).copy()
    plot_df["normalized_hallucination_rate"] = min_max_scaler.fit_transform(plot_df[["Hallucination %"]])

    # Reverse order so lowest hallucination is at top
    plot_df = plot_df.iloc[::-1]
    y_positions = list(range(len(plot_df)))

    ax.barh(
        y_positions,
        plot_df["Hallucination %"],
        color=plt.cm.RdYlGn_r(plot_df["normalized_hallucination_rate"]),
    )

    # Add value labels to the right of bars and answer rate dots at bar end.
    # Columns are read by name rather than through positional itertuples
    # fields, so the loop does not depend on the column order of *df*.
    for i, (rate, answer_rate) in enumerate(zip(plot_df["Hallucination %"], plot_df["Answer %"])):
        ax.text(rate + 0.2, i, f"{rate}%", ha='left', va='center', fontsize=8, fontweight='bold')
        # Answer rate indicator - colored dot at end of bar
        ar_dot_color = '#22aa22' if answer_rate >= 95 else '#cc3333'
        ax.scatter(rate, i, color=ar_dot_color, s=25, zorder=5)

    # Strip org prefix (e.g., "google/gemini-2.5" -> "gemini-2.5")
    labels = [name.split("/")[-1] for name in plot_df["LLM"]]
    ax.set_yticks(y_positions)
    ax.set_yticklabels(labels, fontsize=8)
    ax.set_xlabel("Hallucination Rate", fontsize=10)
    ax.set_title("Grounded Hallucination Rate of Best LLMs", fontsize=12)

    ax.spines['top'].set_visible(False)
    ax.spines['right'].set_visible(False)

    # Add legend for answer rate dots (empty scatters exist only to create
    # the two legend entries).
    ax.scatter([], [], color='#22aa22', s=25, label='≥95%')
    ax.scatter([], [], color='#cc3333', s=25, label='<95%')
    ax.legend(loc='upper right', fontsize=8, framealpha=0.9, title='Answer Rate', title_fontsize=8)

    fig.tight_layout()
    # Leave room on the left for model names and at the bottom for the
    # copyright line added below.
    fig.subplots_adjust(left=0.25, bottom=0.15)

    # Add copyright at bottom
    fig.text(0.5, 0.02, f"Copyright (2025) Vectara, Inc. - Plot generated on {datetime.now().strftime('%B %d, %Y')}",
             ha='center', fontsize=10)

    return fig
183
 
184
  # %%
185
 
186
  if __name__ == "__main__":
187
+ df = load_results()
188
+ print(df)
 
189
 
190
  # %%
191
 
app/requirements.txt CHANGED
@@ -1,7 +1,5 @@
1
- funix==0.6.2
2
  pandas
3
- huggingface_hub==0.36.0
4
  matplotlib
5
- scikit-learn
6
- ipython
7
- git-lfs
 
1
+ gradio>=4.0.0
2
  pandas
3
+ huggingface_hub>=0.20.0
4
  matplotlib
5
+ scikit-learn
 
 
app/vectara_theme.py DELETED
@@ -1,29 +0,0 @@
1
- vectara_theme = {
2
- "name": "vectara",
3
- "funix": {
4
- "run_button": "Refresh",
5
- "grid_height": 960,
6
- "grid_checkbox": False
7
- },
8
- "overrides": {
9
- "MuiAppBar": {
10
- "styleOverrides": {
11
- "root": {
12
- "backgroundColor": "#ffffff",
13
- "color": "rgba(0, 0, 0, 0.87)",
14
- "& .MuiToolbar-root:before": {
15
- "content": '""',
16
- "background": "url('https://huggingface.co/spaces/vectara/README/resolve/main/Vectara-logo.png')",
17
- "display": "block",
18
- "background-size": "contain",
19
- "background-repeat": "no-repeat",
20
- "background-position": "left",
21
- "width": "125px",
22
- "height": "40px",
23
- "margin-right": "10px",
24
- },
25
- },
26
- }
27
- },
28
- },
29
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
src/display/utils.py CHANGED
@@ -20,7 +20,7 @@ class ColumnContent:
20
  hidden: bool = False
21
  never_hidden: bool = False
22
  dummy: bool = False
23
-
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
26
  # Init
 
20
  hidden: bool = False
21
  never_hidden: bool = False
22
  dummy: bool = False
23
+
24
  ## Leaderboard columns
25
  auto_eval_column_dict = []
26
  # Init