lataon committed on
Commit dba24db · 1 Parent(s): f3ebaf3

update: interface

Makefile DELETED
@@ -1,18 +0,0 @@
1
- .PHONY: style format
2
-
3
-
4
- style:
5
- python -m black --line-length 119 .
6
- python -m isort .
7
- ruff check --fix .
8
-
9
-
10
- quality:
11
- python -m black --check --line-length 119 .
12
- python -m isort --check-only .
13
- ruff check .
14
-
15
-
16
- .PHONY: eval
17
- eval:
18
- python -m src.phoneme_eval
README.md CHANGED
@@ -1,48 +1,56 @@
1
- ---
2
- title: Phoneme Detection Leaderboard
3
- emoji: 🥇
4
- colorFrom: green
5
- colorTo: indigo
6
- sdk: gradio
7
- app_file: app.py
8
- pinned: true
9
- license: apache-2.0
10
- short_description: Duplicate this leaderboard to initialize your own!
11
- sdk_version: 5.43.1
12
- tags:
13
- - leaderboard
14
- ---
15
-
16
- # Start the configuration
17
-
18
- Most of the variables to change for a default leaderboard are in `src/envs.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
19
-
20
- Results files should have the following format and be stored as json files:
21
- ```json
22
- {
23
- "config": {
24
- "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
25
- "model_name": "path of the model on the hub: org/model",
26
- "model_sha": "revision on the hub",
27
- },
28
- "results": {
29
- "task_name": {
30
- "metric_name": score,
31
- },
32
- "task_name2": {
33
- "metric_name": score,
34
- }
35
- }
36
- }
37
  ```
38
 
39
- Request files are created automatically by this tool.
40
 
41
- If you encounter a problem on the space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results and eval-results-bk folders.
42
 
43
- # Code logic for more complex edits
44
 
45
- You'll find
46
- - the main table's column names and properties in `src/display/utils.py`
47
- - the logic to read all results and request files, then convert them into dataframe rows, in `src/leaderboard/read_evals.py` and `src/populate.py`
48
- - the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
 
1
+ # Phoneme Detection Leaderboard
2
+
3
+ A clean, simplified phoneme detection leaderboard based on the open_asr_leaderboard interface.
4
+
5
+ ## Features
6
+
7
+ - **Clean Interface**: Uses the same interface structure as open_asr_leaderboard
8
+ - **Phoneme Evaluation**: Evaluates models on phoneme recognition tasks
9
+ - **Multiple Datasets**: Supports evaluation on multiple phoneme datasets
10
+ - **Model Request System**: Allows users to request evaluation of new models
11
+
12
+ ## Structure
13
+
14
+ ```
15
+ ├── app.py # Main Gradio application
16
+ ├── constants.py # Constants and text definitions
17
+ ├── utils_display.py # Display utilities and column definitions
18
+ ├── init.py # Initialization and hub integration
19
+ ├── phoneme_eval.py # Core phoneme evaluation logic
20
+ ├── utils/ # Utility modules
21
+ │ ├── load_model.py # Model loading and inference
22
+ │ ├── audio_process.py # Audio processing and PER calculation
23
+ │ └── cmu_process.py # CMU to IPA conversion
24
+ ├── requirements.txt # Python dependencies
25
+ └── README.md # This file
26
  ```
27
 
28
+ ## Usage
29
+
30
+ 1. Install dependencies:
31
+ ```bash
32
+ pip install -r requirements.txt
33
+ ```
34
+
35
+ 2. Run the application:
36
+ ```bash
37
+ python app.py
38
+ ```
39
+
40
+ 3. Run evaluation:
41
+ ```bash
42
+ python phoneme_eval.py
43
+ ```
44
+
45
+ ## Evaluation
46
+
47
+ The leaderboard evaluates models on:
48
+ - **PER (Phoneme Error Rate)**: Lower is better
49
+ - **Average Duration**: Processing time per sample
50
 
51
+ Models are ranked by Average PER across all datasets.
52
 
53
+ ## Datasets
54
 
55
+ - `phoneme_asr`: General phoneme recognition dataset
56
+ - `kids_phoneme_md`: Children's speech phoneme dataset
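
The "Evaluation" section above ranks models by Average PER. As a minimal sketch of that aggregation (an editor's illustration, not code from this commit; it only assumes the `eval-results/*.json` schema shown further down in this diff, i.e. `config.model_name` plus per-dataset `per` values), the ranking can be reproduced like this:

```python
import glob
import json
import os

def summarize(result_path: str):
    """Return (model_name, average PER) for one result file."""
    with open(result_path, "r", encoding="utf-8") as f:
        data = json.load(f)
    pers = [v["per"] for v in data.get("results", {}).values() if "per" in v]
    avg_per = sum(pers) / len(pers) if pers else float("inf")  # no PER -> sort last
    return data.get("config", {}).get("model_name", "unknown"), avg_per

# Rank every model found under eval-results/ by its average PER (lower is better).
rows = sorted(
    (summarize(p) for p in glob.glob(os.path.join("eval-results", "*.json"))),
    key=lambda row: row[1],
)
for model, avg_per in rows:
    print(f"{model}: {avg_per:.2f}")
```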
 
 
app.py CHANGED
@@ -1,15 +1,32 @@
 
 
 
1
  import os
2
  import glob
3
- import json
4
- import pandas as pd
5
- import gradio as gr
 
 
 
 
6
 
 
 
7
 
8
- ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
9
- EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")
10
11
 
12
  def load_results(results_dir: str) -> pd.DataFrame:
 
13
  rows = []
14
  all_dataset_keys = set()
15
 
@@ -62,10 +79,9 @@ def load_results(results_dir: str) -> pd.DataFrame:
62
  avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
63
 
64
  row = {
65
- "Model": model_name,
66
- "Avg PER": avg_per,
67
  "Avg Duration (s)": avg_dur,
68
- "_file": os.path.basename(path),
69
  }
70
  row.update(per_values)
71
  rows.append(row)
@@ -76,56 +92,198 @@ def load_results(results_dir: str) -> pd.DataFrame:
76
  df = pd.DataFrame(rows)
77
  if df.empty:
78
  # Create default columns based on discovered datasets
79
- default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
80
  for key in sorted(all_dataset_keys):
81
  display_name = dataset_display_names[key]
82
  default_cols.insert(-2, f"PER {display_name}")
83
  return pd.DataFrame(columns=default_cols)
84
 
85
- df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
86
  return df.reset_index(drop=True)
87
88
 
89
- def build_interface():
90
- with gr.Blocks() as demo:
91
- gr.Markdown("# Simple Phoneme Leaderboard")
92
- info = gr.Markdown(f"Results directory: `{EVAL_RESULTS_DIR}`")
93
-
94
- # Get initial data to determine columns dynamically
95
- initial_df = load_results(EVAL_RESULTS_DIR)
96
- if not initial_df.empty:
97
- headers = list(initial_df.columns)
98
- # Remove internal columns
99
- headers = [h for h in headers if not h.startswith('_')]
100
- else:
101
- headers = ["Model", "Avg PER", "Avg Duration (s)"]
102
 
103
- table = gr.Dataframe(headers=headers, row_count=5)
104
 
105
- def refresh():
106
- df = load_results(EVAL_RESULTS_DIR)
107
- if df.empty:
108
- return df
109
-
110
- # Get the column order from the dataframe
111
- cols = [c for c in df.columns if not c.startswith('_')]
112
 
113
- # Ensure all columns exist for the dataframe component
114
- for c in cols:
115
- if c not in df.columns:
116
- df[c] = None
117
- return df[cols].round(3)
118
 
119
- btn = gr.Button("Refresh")
120
- btn.click(fn=refresh, outputs=table)
121
 
122
- # Auto-load on start
123
- table.value = refresh()
124
- return demo
 
125
 
 
 
 
126
 
127
- if __name__ == "__main__":
128
- demo = build_interface()
129
- demo.queue().launch()
130
131
 
 
 
1
+ import gradio as gr
2
+ import pandas as pd
3
+ import json
4
  import os
5
  import glob
6
+ from constants import BANNER, INTRODUCTION_TEXT, CITATION_TEXT, METRICS_TAB_TEXT, DIR_OUTPUT_REQUESTS, LEADERBOARD_CSS
7
+ from init import is_model_on_hub, upload_file, load_all_info_from_dataset_hub
8
+ from utils_display import PhonemeEvalColumn, fields, make_clickable_model, styled_error, styled_message
9
+ import numpy as np
10
+ from datetime import datetime, timezone
11
+
12
+ LAST_UPDATED = "Oct 2nd 2025"
13
 
14
+ # Global variable to store detailed benchmark data
15
+ benchmark_details = {}
16
 
17
+ # Directory for evaluation results
18
+ EVAL_RESULTS_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), "eval-results")
19
 
20
+ column_names = {
21
+ "model": "Model",
22
+ "avg_per": "Average PER ⬇️",
23
+ "avg_duration": "Avg Duration (s)",
24
+ "per_phoneme_asr": "PER phoneme_asr",
25
+ "per_kids_phoneme_md": "PER kids_phoneme_md",
26
+ }
27
 
28
  def load_results(results_dir: str) -> pd.DataFrame:
29
+ """Load results from JSON files in the results directory"""
30
  rows = []
31
  all_dataset_keys = set()
32
 
 
79
  avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
80
 
81
  row = {
82
+ "Model": make_clickable_model(model_name),
83
+ "Average PER ⬇️": avg_per,
84
  "Avg Duration (s)": avg_dur,
 
85
  }
86
  row.update(per_values)
87
  rows.append(row)
 
92
  df = pd.DataFrame(rows)
93
  if df.empty:
94
  # Create default columns based on discovered datasets
95
+ default_cols = ["Model", "Average PER ⬇️", "Avg Duration (s)"]
96
  for key in sorted(all_dataset_keys):
97
  display_name = dataset_display_names[key]
98
  default_cols.insert(-2, f"PER {display_name}")
99
  return pd.DataFrame(columns=default_cols)
100
 
101
+ df = df.sort_values(by=["Average PER ⬇️"], ascending=True, na_position="last")
102
  return df.reset_index(drop=True)
103
 
104
+ # Load initial data
105
+ try:
106
+ eval_queue_repo, requested_models, csv_results = load_all_info_from_dataset_hub()
107
+ if csv_results and csv_results.exists():
108
+ original_df = pd.read_csv(csv_results)
109
+ # Format the columns
110
+ def formatter(x):
111
+ if type(x) is str:
112
+ x = x
113
+ elif x == -1:
114
+ x = "NA"
115
+ else:
116
+ x = round(x, 2)
117
+ return x
118
 
119
+ for col in original_df.columns:
120
+ if col == "model":
121
+ original_df[col] = original_df[col].apply(lambda x: make_clickable_model(x))
122
+ else:
123
+ original_df[col] = original_df[col].apply(formatter)
124
+ # Only rename columns that exist in the dataframe
125
+ existing_columns = {k: v for k, v in column_names.items() if k in original_df.columns}
126
+ original_df.rename(columns=existing_columns, inplace=True)
127
+ if 'Average PER ⬇️' in original_df.columns:
128
+ original_df.sort_values(by='Average PER ⬇️', inplace=True)
129
+ else:
130
+ # Fallback to local results
131
+ original_df = load_results(EVAL_RESULTS_DIR)
132
+ except Exception as e:
133
+ print(f"Error loading data: {e}")
134
+ # Fallback to local results
135
+ original_df = load_results(EVAL_RESULTS_DIR)
136
+
137
+ # If no data is loaded, create a sample empty dataframe with proper columns
138
+ if original_df.empty:
139
+ print("No results found. Creating empty dataframe with sample data...")
140
+ # Create sample data to demonstrate the interface
141
+ sample_data = {
142
+ "Model": [make_clickable_model("sample/hubert-base"), make_clickable_model("sample/whisper-base")],
143
+ "Average PER ⬇️": [15.2, 18.5],
144
+ "Avg Duration (s)": [0.12, 0.15],
145
+ "PER phoneme_asr": [14.8, 17.2],
146
+ "PER kids_phoneme_md": [15.6, 19.8]
147
+ }
148
+ original_df = pd.DataFrame(sample_data)
149
+ print("Sample data created for demonstration.")
150
+
151
+ COLS = [c.name for c in fields(PhonemeEvalColumn)]
152
+ TYPES = [c.type for c in fields(PhonemeEvalColumn)]
153
+
154
+ def request_model(model_text, chb_phoneme_asr, chb_kids_phoneme_md):
155
+
156
+ # Determine the selected checkboxes
157
+ dataset_selection = []
158
+ if chb_phoneme_asr:
159
+ dataset_selection.append("phoneme_asr")
160
+ if chb_kids_phoneme_md:
161
+ dataset_selection.append("kids_phoneme_md")
162
+
163
+ if len(dataset_selection) == 0:
164
+ return styled_error("You need to select at least one dataset")
165
 
166
+ base_model_on_hub, error_msg = is_model_on_hub(model_text)
167
 
168
+ if not base_model_on_hub:
169
+ return styled_error(f"Base model '{model_text}' {error_msg}")
170
+
171
+ # Construct the output dictionary
172
+ current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
173
+ required_datasets = ', '.join(dataset_selection)
174
+ eval_entry = {
175
+ "date": current_time,
176
+ "model": model_text,
177
+ "datasets_selected": required_datasets
178
+ }
179
+
180
+ # Prepare file path
181
+ DIR_OUTPUT_REQUESTS.mkdir(parents=True, exist_ok=True)
182
+
183
+ fn_datasets = '@ '.join(dataset_selection)
184
+ filename = model_text.replace("/","@") + "@@" + fn_datasets
185
+ if filename in requested_models:
186
+ return styled_error(f"A request for this model '{model_text}' and dataset(s) was already made.")
187
+ try:
188
+ filename_ext = filename + ".txt"
189
+ out_filepath = DIR_OUTPUT_REQUESTS / filename_ext
190
+
191
+ # Write the results to a text file
192
+ with open(out_filepath, "w") as f:
193
+ f.write(json.dumps(eval_entry))
194
 
195
+ upload_file(filename, out_filepath)
196
+
197
+ # Include file in the list of uploaded files
198
+ requested_models.append(filename)
199
+
200
+ # Remove the local file
201
+ out_filepath.unlink()
202
+
203
+ return styled_message("🤗 Your request has been submitted and will be evaluated soon!</p>")
204
+ except Exception as e:
205
+ return styled_error(f"Error submitting request!")
206
 
207
+ def filter_main_table(show_proprietary=True):
208
+ filtered_df = original_df.copy()
209
+
210
+ # Filter proprietary models if needed
211
+ if not show_proprietary and "License" in filtered_df.columns:
212
+ # Keep only models with "Open" license
213
+ filtered_df = filtered_df[filtered_df["License"] == "Open"]
214
+
215
+ return filtered_df
216
 
217
+ def refresh_results():
218
+ """Refresh the results from the eval-results directory"""
219
+ updated_df = load_results(EVAL_RESULTS_DIR)
220
+ return updated_df
221
 
222
+ with gr.Blocks(css=LEADERBOARD_CSS) as demo:
223
+ # gr.HTML(BANNER, elem_id="banner")
224
+ gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
225
 
226
+ with gr.Tabs(elem_classes="tab-buttons") as tabs:
227
+ with gr.TabItem("🏅 Leaderboard", elem_id="phoneme-benchmark-tab-table", id=0):
228
+ leaderboard_table = gr.components.Dataframe(
229
+ value=original_df,
230
+ datatype=TYPES,
231
+ elem_id="leaderboard-table",
232
+ interactive=False,
233
+ visible=True,
234
+ )
235
+ with gr.Row():
236
+ show_proprietary_checkbox = gr.Checkbox(
237
+ label="Show proprietary models",
238
+ value=True,
239
+ elem_id="show-proprietary-checkbox"
240
+ )
241
+ refresh_button = gr.Button("🔄 Refresh Results", variant="secondary")
242
+
243
+ # Connect checkbox to the filtering function
244
+ show_proprietary_checkbox.change(
245
+ filter_main_table,
246
+ inputs=[show_proprietary_checkbox],
247
+ outputs=leaderboard_table
248
+ )
249
+
250
+ # Connect refresh button
251
+ refresh_button.click(
252
+ refresh_results,
253
+ outputs=leaderboard_table
254
+ )
255
 
256
+ with gr.TabItem("📈 Metrics", elem_id="phoneme-benchmark-tab-table", id=1):
257
+ gr.Markdown(METRICS_TAB_TEXT, elem_classes="markdown-text")
258
+
259
+ with gr.TabItem("✉️✨ Request a model here!", elem_id="phoneme-benchmark-tab-table", id=2):
260
+ with gr.Column():
261
+ gr.Markdown("# ✉️✨ Request results for a new model here!", elem_classes="markdown-text")
262
+ with gr.Column():
263
+ gr.Markdown("Select datasets:", elem_classes="markdown-text")
264
+ with gr.Column():
265
+ model_name_textbox = gr.Textbox(label="Model name (user_name/model_name)")
266
+ chb_phoneme_asr = gr.Checkbox(label="phoneme_asr dataset", value=True)
267
+ chb_kids_phoneme_md = gr.Checkbox(label="kids_phoneme_md dataset", value=True)
268
+ with gr.Column():
269
+ mdw_submission_result = gr.Markdown()
270
+ btn_submitt = gr.Button(value="🚀 Request")
271
+ btn_submitt.click(request_model,
272
+ [model_name_textbox, chb_phoneme_asr, chb_kids_phoneme_md],
273
+ mdw_submission_result)
274
+ # add an about section
275
+ with gr.TabItem("🤗 About", elem_id="phoneme-benchmark-tab-table", id=3):
276
+ gr.Markdown("## About", elem_classes="markdown-text")
277
+
278
+ gr.Markdown(f"Last updated on **{LAST_UPDATED}**", elem_classes="markdown-text")
279
+
280
+ with gr.Row():
281
+ with gr.Accordion("📙 Citation", open=False):
282
+ gr.Textbox(
283
+ value=CITATION_TEXT, lines=7,
284
+ label="Copy the BibTeX snippet to cite this source",
285
+ elem_id="citation-button",
286
+ show_copy_button=True,
287
+ )
288
 
289
+ demo.launch(ssr_mode=False)
app_default.py DELETED
@@ -1,463 +0,0 @@
1
- import gradio as gr
2
- from gradio_leaderboard import Leaderboard, ColumnFilter, SelectColumns
3
- import pandas as pd
4
- from apscheduler.schedulers.background import BackgroundScheduler
5
- from huggingface_hub import snapshot_download
6
- import os
7
-
8
- from src.about import (
9
- CITATION_BUTTON_LABEL,
10
- CITATION_BUTTON_TEXT,
11
- EVALUATION_QUEUE_TEXT,
12
- INTRODUCTION_TEXT,
13
- LLM_BENCHMARKS_TEXT,
14
- TITLE,
15
- )
16
- from src.display.css_html_js import custom_css
17
- from src.display.utils import (
18
- COLS,
19
- AutoEvalColumn,
20
- fields,
21
- )
22
- from src.about import Tasks
23
- from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
24
- from src.populate import get_evaluation_queue_df, get_leaderboard_df
25
- from src.submission.submit import add_new_eval
26
-
27
- # Import simple leaderboard functionality
28
- import glob
29
- import json
30
- from functools import lru_cache
31
-
32
-
33
- def restart_space():
34
- API.restart_space(repo_id=REPO_ID)
35
-
36
- ### Space initialisation (prefer local JSONs, fall back to Hub)
37
- def _has_local_json(path: str) -> bool:
38
- try:
39
- return os.path.isdir(path) and any(str(f).endswith(".json") for f in os.listdir(path))
40
- except Exception:
41
- return False
42
-
43
- if not _has_local_json(EVAL_REQUESTS_PATH):
44
- try:
45
- print(EVAL_REQUESTS_PATH)
46
- snapshot_download(
47
- repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
48
- )
49
- except Exception:
50
- pass
51
-
52
- if not _has_local_json(EVAL_RESULTS_PATH):
53
- try:
54
- print(EVAL_RESULTS_PATH)
55
- snapshot_download(
56
- repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", tqdm_class=None, etag_timeout=30, token=TOKEN
57
- )
58
- except Exception:
59
- pass
60
-
61
-
62
- # Build benchmark and evaluation queue column metadata
63
- BENCHMARK_COLS = [f"{task.value.col_name} ({task.name})" for task in Tasks]
64
-
65
- EVAL_COLS = [
66
- "Model",
67
- "Model sha",
68
- "status",
69
- "precision",
70
- "weight_type",
71
- "model_type",
72
- "likes",
73
- "params",
74
- "license",
75
- "submitted_time",
76
- ]
77
-
78
- EVAL_TYPES = [
79
- "markdown", # Model
80
- "str", # Model sha
81
- "str", # status
82
- "str", # precision
83
- "str", # weight_type
84
- "str", # model_type
85
- "number", # likes
86
- "number", # params
87
- "str", # license
88
- "str", # submitted_time
89
- ]
90
-
91
- # Hide all models from the leaderboard view
92
- LEADERBOARD_DF = pd.DataFrame(columns=COLS)
93
-
94
- (
95
- finished_eval_queue_df,
96
- running_eval_queue_df,
97
- pending_eval_queue_df,
98
- ) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)
99
-
100
- @lru_cache(maxsize=1)
101
- def _get_simple_dataset_keys(results_dir: str) -> tuple:
102
- """Cache dataset keys to avoid repeated file scanning."""
103
- all_dataset_keys = set()
104
- if not os.path.isdir(results_dir):
105
- return tuple()
106
-
107
- for path in glob.glob(os.path.join(results_dir, "*.json")):
108
- try:
109
- with open(path, "r", encoding="utf-8") as f:
110
- data = json.load(f)
111
- res = data.get("results", {})
112
- all_dataset_keys.update(res.keys())
113
- except Exception:
114
- continue
115
-
116
- return tuple(sorted(all_dataset_keys))
117
-
118
- def load_simple_results(results_dir: str) -> pd.DataFrame:
119
- """Load and process evaluation results from JSON files for simple leaderboard with caching."""
120
- rows = []
121
- all_dataset_keys = set(_get_simple_dataset_keys(results_dir))
122
-
123
- if not all_dataset_keys:
124
- return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
125
-
126
- # Use dataset keys directly as display names
127
- dataset_display_names = {key: key for key in all_dataset_keys}
128
-
129
- # Single pass: extract data with optimized processing
130
- for path in glob.glob(os.path.join(results_dir, "*.json")):
131
- try:
132
- with open(path, "r", encoding="utf-8") as f:
133
- data = json.load(f)
134
- cfg = data.get("config", {})
135
- res = data.get("results", {})
136
-
137
- model_name = cfg.get("model_name", "unknown")
138
-
139
- # Extract PER for each dataset dynamically
140
- per_values = {}
141
- dur_values = []
142
-
143
- for dataset_key in all_dataset_keys:
144
- dataset_data = res.get(dataset_key, {})
145
- per_value = dataset_data.get("per") if dataset_data else None
146
- dur_value = dataset_data.get("avg_duration") if dataset_data else None
147
-
148
- display_name = dataset_display_names[dataset_key]
149
- per_values[f"PER {display_name}"] = per_value
150
-
151
- if dur_value is not None:
152
- dur_values.append(dur_value)
153
-
154
- # Calculate average PER across all datasets
155
- per_vals = [v for v in per_values.values() if v is not None]
156
- avg_per = sum(per_vals) / len(per_vals) if per_vals else None
157
-
158
- # Calculate average duration
159
- avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
160
-
161
- row = {
162
- "Model": model_name,
163
- "Avg PER": avg_per,
164
- "Avg Duration (s)": avg_dur,
165
- "_file": os.path.basename(path),
166
- }
167
- row.update(per_values)
168
- rows.append(row)
169
-
170
- except Exception:
171
- continue
172
-
173
- df = pd.DataFrame(rows)
174
- if df.empty:
175
- # Create default columns based on discovered datasets
176
- default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
177
- for key in sorted(all_dataset_keys):
178
- display_name = dataset_display_names[key]
179
- default_cols.insert(-2, f"PER {display_name}")
180
- return pd.DataFrame(columns=default_cols)
181
-
182
- df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
183
- return df.reset_index(drop=True)
184
-
185
-
186
- def init_leaderboard(dataframe):
187
- if dataframe is None or dataframe.empty:
188
- dataframe = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
189
- return Leaderboard(
190
- value=dataframe,
191
- datatype=[c.type for c in fields(AutoEvalColumn)],
192
- select_columns=SelectColumns(
193
- default_selection=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default],
194
- cant_deselect=[c.name for c in fields(AutoEvalColumn) if c.never_hidden],
195
- label="Select Columns to Display:",
196
- ),
197
- search_columns=[AutoEvalColumn.model.name, AutoEvalColumn.license.name],
198
- hide_columns=[c.name for c in fields(AutoEvalColumn) if c.hidden],
199
- filter_columns=[
200
- ColumnFilter(AutoEvalColumn.model_type.name, type="checkboxgroup", label="Model types"),
201
- ColumnFilter(AutoEvalColumn.precision.name, type="checkboxgroup", label="Precision"),
202
- ColumnFilter(
203
- AutoEvalColumn.params.name,
204
- type="slider",
205
- min=0.01,
206
- max=150,
207
- label="Select the number of parameters (B)",
208
- ),
209
- ColumnFilter(
210
- AutoEvalColumn.still_on_hub.name, type="boolean", label="Deleted/incomplete", default=True
211
- ),
212
- ],
213
- bool_checkboxgroup_label="Hide models",
214
- interactive=False,
215
- )
216
-
217
-
218
- demo = gr.Blocks(css=custom_css)
219
- with demo:
220
- gr.HTML(TITLE)
221
- gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
222
-
223
- with gr.Tabs(elem_classes="tab-buttons") as tabs:
224
- with gr.TabItem("🏅 Phoneme Benchmark", elem_id="llm-benchmark-tab-table", id=0):
225
- leaderboard = init_leaderboard(LEADERBOARD_DF)
226
-
227
- with gr.TabItem("📊 Simple Results", elem_id="simple-results-tab", id=1):
228
- gr.Markdown("## 🎯 Phoneme Detection Results")
229
- gr.Markdown("Compare phoneme recognition models across different datasets")
230
-
231
- # Stats section for simple results
232
- with gr.Row():
233
- simple_total_models = gr.HTML(
234
- '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>'
235
- )
236
- simple_best_per = gr.HTML(
237
- '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>'
238
- )
239
- simple_avg_duration = gr.HTML(
240
- '<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">-</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
241
- )
242
-
243
- # Get initial data to determine columns dynamically
244
- initial_df = load_simple_results(EVAL_RESULTS_PATH)
245
- if not initial_df.empty:
246
- headers = list(initial_df.columns)
247
- # Remove internal columns
248
- headers = [h for h in headers if not h.startswith('_')]
249
- else:
250
- headers = ["Model", "Avg PER", "Avg Duration (s)"]
251
-
252
- with gr.Row():
253
- with gr.Column(scale=4):
254
- simple_table = gr.Dataframe(
255
- headers=headers,
256
- row_count=10,
257
- label="🏆 Model Performance Leaderboard",
258
- interactive=False
259
- )
260
-
261
- with gr.Column(scale=1):
262
- refresh_btn = gr.Button("🔄 Refresh Data", variant="primary")
263
-
264
- # Export options
265
- with gr.Accordion("📥 Export Data", open=False):
266
- export_csv = gr.Button("📄 Export CSV", variant="secondary")
267
- export_json = gr.Button("📋 Export JSON", variant="secondary")
268
-
269
- def refresh_simple():
270
- """Refresh the simple leaderboard data with enhanced stats."""
271
- df = load_simple_results(EVAL_RESULTS_PATH)
272
-
273
- if df.empty:
274
- return df, "No data", "No data", "No data"
275
-
276
- # Get the column order from the dataframe
277
- cols = [c for c in df.columns if not c.startswith('_')]
278
-
279
- # Ensure all columns exist for the dataframe component
280
- for c in cols:
281
- if c not in df.columns:
282
- df[c] = None
283
-
284
- # Calculate enhanced stats
285
- total_models = len(df)
286
- best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
287
- avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"
288
-
289
- # Format stats
290
- best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
291
- avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
292
-
293
- return (
294
- df[cols].round(3),
295
- f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{total_models}</div><div style="font-size: 0.9rem; opacity: 0.9;">Total Models</div></div>',
296
- f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{best_per_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Best PER</div></div>',
297
- f'<div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); color: white; padding: 1rem; border-radius: 10px; text-align: center; min-width: 150px;"><div style="font-size: 1.5rem; font-weight: bold;">{avg_duration_str}</div><div style="font-size: 0.9rem; opacity: 0.9;">Avg Duration</div></div>'
298
- )
299
-
300
- def export_simple_csv():
301
- """Export simple results as CSV."""
302
- df = load_simple_results(EVAL_RESULTS_PATH)
303
- if df.empty:
304
- return None
305
- cols = [c for c in df.columns if not c.startswith('_')]
306
- return df[cols].round(3)
307
-
308
- def export_simple_json():
309
- """Export simple results as JSON."""
310
- df = load_simple_results(EVAL_RESULTS_PATH)
311
- if df.empty:
312
- return None
313
- cols = [c for c in df.columns if not c.startswith('_')]
314
- return df[cols].round(3).to_json(orient='records', indent=2)
315
-
316
- # Connect events
317
- refresh_btn.click(
318
- fn=refresh_simple,
319
- outputs=[simple_table, simple_total_models, simple_best_per, simple_avg_duration]
320
- )
321
-
322
- export_csv.click(
323
- fn=export_simple_csv,
324
- outputs=gr.File(label="Download CSV")
325
- )
326
-
327
- export_json.click(
328
- fn=export_simple_json,
329
- outputs=gr.File(label="Download JSON")
330
- )
331
-
332
- # Auto-load on start
333
- simple_table.value, simple_total_models.value, simple_best_per.value, simple_avg_duration.value = refresh_simple()
334
-
335
- # Enhanced help section
336
- with gr.Accordion("ℹ️ About this Leaderboard", open=False):
337
- gr.Markdown("""
338
- ## 📊 Understanding the Results
339
-
340
- **Performance Metrics:**
341
- - **PER (Phoneme Error Rate)**: Lower values indicate better performance
342
- - **Avg Duration**: Processing time per sample (lower is faster)
343
- - **Models are ranked by average PER across all datasets**
344
-
345
- **Datasets Evaluated:**
346
- - `phoneme_asr`: General phoneme recognition dataset
347
- - `kids_phoneme_md`: Kids' phoneme recognition dataset
348
-
349
- **How to Interpret:**
350
- - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
351
- - **Duration**: Time efficiency (important for real-time applications)
352
- - **Average PER**: Overall model performance across all datasets
353
-
354
- **Tips for Model Selection:**
355
- - Choose models with low PER for accuracy-critical applications
356
- - Consider duration for real-time or resource-constrained environments
357
- - Balance between accuracy (PER) and speed (Duration) based on your needs
358
- """)
359
-
360
- with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
361
- gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
362
-
363
- with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=3):
364
- with gr.Column():
365
- with gr.Row():
366
- gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
367
-
368
- with gr.Column():
369
- with gr.Accordion(
370
- f"✅ Finished Evaluations ({len(finished_eval_queue_df)})",
371
- open=False,
372
- ):
373
- with gr.Row():
374
- finished_eval_table = gr.components.Dataframe(
375
- value=finished_eval_queue_df,
376
- headers=EVAL_COLS,
377
- datatype=EVAL_TYPES,
378
- row_count=5,
379
- )
380
- with gr.Accordion(
381
- f"🔄 Running Evaluation Queue ({len(running_eval_queue_df)})",
382
- open=False,
383
- ):
384
- with gr.Row():
385
- running_eval_table = gr.components.Dataframe(
386
- value=running_eval_queue_df,
387
- headers=EVAL_COLS,
388
- datatype=EVAL_TYPES,
389
- row_count=5,
390
- )
391
-
392
- with gr.Accordion(
393
- f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})",
394
- open=False,
395
- ):
396
- with gr.Row():
397
- pending_eval_table = gr.components.Dataframe(
398
- value=pending_eval_queue_df,
399
- headers=EVAL_COLS,
400
- datatype=EVAL_TYPES,
401
- row_count=5,
402
- )
403
- with gr.Row():
404
- gr.Markdown("# ✉️✨ Submit your model here!", elem_classes="markdown-text")
405
-
406
- with gr.Row():
407
- with gr.Column():
408
- model_name_textbox = gr.Textbox(label="Model name")
409
- revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main")
410
- model_type = gr.Dropdown(
411
- choices=["Pretrained", "Fine-tuned", "Merge", "Other"],
412
- label="Model type",
413
- multiselect=False,
414
- value=None,
415
- interactive=True,
416
- )
417
-
418
- with gr.Column():
419
- precision = gr.Dropdown(
420
- choices=["float16", "bfloat16", "float32", "int8", "int4"],
421
- label="Precision",
422
- multiselect=False,
423
- value="float16",
424
- interactive=True,
425
- )
426
- weight_type = gr.Dropdown(
427
- choices=["Original", "Delta", "Adapter"],
428
- label="Weights type",
429
- multiselect=False,
430
- value="Original",
431
- interactive=True,
432
- )
433
- base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)")
434
-
435
- submit_button = gr.Button("Submit Eval")
436
- submission_result = gr.Markdown()
437
- submit_button.click(
438
- add_new_eval,
439
- [
440
- model_name_textbox,
441
- base_model_name_textbox,
442
- revision_name_textbox,
443
- precision,
444
- weight_type,
445
- model_type,
446
- ],
447
- submission_result,
448
- )
449
-
450
- with gr.Row():
451
- with gr.Accordion("📙 Citation", open=False):
452
- citation_button = gr.Textbox(
453
- value=CITATION_BUTTON_TEXT,
454
- label=CITATION_BUTTON_LABEL,
455
- lines=20,
456
- elem_id="citation-button",
457
- show_copy_button=True,
458
- )
459
-
460
- scheduler = BackgroundScheduler()
461
- scheduler.add_job(restart_space, "interval", seconds=1800)
462
- scheduler.start()
463
- demo.queue(default_concurrency_limit=40).launch()
app_simple.py DELETED
@@ -1,318 +0,0 @@
1
- import os
2
- import glob
3
- import json
4
- import pandas as pd
5
- import gradio as gr
6
- from typing import Optional, Dict, List
7
- import time
8
- from functools import lru_cache
9
-
10
- ROOT_DIR = os.path.dirname(os.path.abspath(__file__))
11
- EVAL_RESULTS_DIR = os.path.join(ROOT_DIR, "eval-results")
12
-
13
-
14
- @lru_cache(maxsize=1)
15
- def _get_dataset_keys(results_dir: str) -> tuple:
16
- """Cache dataset keys to avoid repeated file scanning."""
17
- all_dataset_keys = set()
18
- if not os.path.isdir(results_dir):
19
- return tuple()
20
-
21
- for path in glob.glob(os.path.join(results_dir, "*.json")):
22
- try:
23
- with open(path, "r", encoding="utf-8") as f:
24
- data = json.load(f)
25
- res = data.get("results", {})
26
- all_dataset_keys.update(res.keys())
27
- except Exception:
28
- continue
29
-
30
- return tuple(sorted(all_dataset_keys))
31
-
32
- def load_results(results_dir: str) -> pd.DataFrame:
33
- """
34
- Load and process evaluation results from JSON files.
35
- Dynamically handles any number of datasets with caching for performance.
36
- """
37
- rows = []
38
- all_dataset_keys = set(_get_dataset_keys(results_dir))
39
-
40
- if not all_dataset_keys:
41
- return pd.DataFrame(columns=["Model", "Avg PER", "Avg Duration (s)"])
42
-
43
- # Use dataset keys directly as display names
44
- dataset_display_names = {key: key for key in all_dataset_keys}
45
-
46
- # Single pass: extract data with optimized processing
47
- for path in glob.glob(os.path.join(results_dir, "*.json")):
48
- try:
49
- with open(path, "r", encoding="utf-8") as f:
50
- data = json.load(f)
51
- cfg = data.get("config", {})
52
- res = data.get("results", {})
53
-
54
- model_name = cfg.get("model_name", "unknown")
55
-
56
- # Extract PER for each dataset dynamically
57
- per_values = {}
58
- dur_values = []
59
-
60
- for dataset_key in all_dataset_keys:
61
- dataset_data = res.get(dataset_key, {})
62
- per_value = dataset_data.get("per") if dataset_data else None
63
- dur_value = dataset_data.get("avg_duration") if dataset_data else None
64
-
65
- display_name = dataset_display_names[dataset_key]
66
- per_values[f"PER {display_name}"] = per_value
67
-
68
- if dur_value is not None:
69
- dur_values.append(dur_value)
70
-
71
- # Calculate average PER across all datasets
72
- per_vals = [v for v in per_values.values() if v is not None]
73
- avg_per = sum(per_vals) / len(per_vals) if per_vals else None
74
-
75
- # Calculate average duration
76
- avg_dur = sum(dur_values) / len(dur_values) if dur_values else None
77
-
78
- row = {
79
- "Model": model_name,
80
- "Avg PER": avg_per,
81
- "Avg Duration (s)": avg_dur,
82
- "_file": os.path.basename(path),
83
- }
84
- row.update(per_values)
85
- rows.append(row)
86
-
87
- except Exception:
88
- continue
89
-
90
- df = pd.DataFrame(rows)
91
- if df.empty:
92
- # Create default columns based on discovered datasets
93
- default_cols = ["Model", "Avg PER", "Avg Duration (s)"]
94
- for key in sorted(all_dataset_keys):
95
- display_name = dataset_display_names[key]
96
- default_cols.insert(-2, f"PER {display_name}")
97
- return pd.DataFrame(columns=default_cols)
98
-
99
- df = df.sort_values(by=["Avg PER"], ascending=True, na_position="last")
100
- return df.reset_index(drop=True)
101
-
102
-
103
- def build_interface():
104
- """Build the optimized Gradio interface for the phoneme leaderboard."""
105
-
106
- # Custom CSS for better styling
107
- custom_css = """
108
- .gradio-container {
109
- max-width: 1200px !important;
110
- margin: 0 auto !important;
111
- }
112
- .leaderboard-header {
113
- text-align: center;
114
- margin-bottom: 2rem;
115
- }
116
- .stats-container {
117
- display: flex;
118
- gap: 1rem;
119
- margin-bottom: 1rem;
120
- flex-wrap: wrap;
121
- }
122
- .stat-card {
123
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
124
- color: white;
125
- padding: 1rem;
126
- border-radius: 10px;
127
- text-align: center;
128
- min-width: 150px;
129
- flex: 1;
130
- }
131
- .stat-value {
132
- font-size: 1.5rem;
133
- font-weight: bold;
134
- margin-bottom: 0.5rem;
135
- }
136
- .stat-label {
137
- font-size: 0.9rem;
138
- opacity: 0.9;
139
- }
140
- .table-container {
141
- margin-top: 1rem;
142
- }
143
- .refresh-btn {
144
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
145
- color: white;
146
- border: none;
147
- padding: 0.5rem 1rem;
148
- border-radius: 5px;
149
- cursor: pointer;
150
- }
151
- """
152
-
153
- with gr.Blocks(
154
- title="Phoneme Detection Leaderboard",
155
- css=custom_css,
156
- theme=gr.themes.Soft()
157
- ) as demo:
158
-
159
- # Header section
160
- with gr.Column(elem_classes="leaderboard-header"):
161
- gr.Markdown("# 🎯 Phoneme Detection Leaderboard")
162
- gr.Markdown("Compare phoneme recognition models across different datasets")
163
-
164
- # Stats section
165
- with gr.Row(elem_classes="stats-container"):
166
- total_models = gr.HTML(
167
- '<div class="stat-card"><div class="stat-value" id="total-models">-</div><div class="stat-label">Total Models</div></div>',
168
- elem_id="total-models-card"
169
- )
170
- best_per = gr.HTML(
171
- '<div class="stat-card"><div class="stat-value" id="best-per">-</div><div class="stat-label">Best PER</div></div>',
172
- elem_id="best-per-card"
173
- )
174
- avg_duration = gr.HTML(
175
- '<div class="stat-card"><div class="stat-value" id="avg-duration">-</div><div class="stat-label">Avg Duration</div></div>',
176
- elem_id="avg-duration-card"
177
- )
178
-
179
- # Main content
180
- with gr.Row():
181
- with gr.Column(scale=4):
182
- # Get initial data to determine columns dynamically
183
- initial_df = load_results(EVAL_RESULTS_DIR)
184
- if not initial_df.empty:
185
- headers = list(initial_df.columns)
186
- # Remove internal columns
187
- headers = [h for h in headers if not h.startswith('_')]
188
- else:
189
- headers = ["Model", "Avg PER", "Avg Duration (s)"]
190
-
191
- table = gr.Dataframe(
192
- headers=headers,
193
- row_count=10,
194
- label="🏆 Model Performance Leaderboard",
195
- interactive=False,
196
- elem_classes="table-container"
197
- )
198
-
199
- with gr.Column(scale=1):
200
- refresh_btn = gr.Button(
201
- "🔄 Refresh Data",
202
- variant="primary",
203
- elem_classes="refresh-btn"
204
- )
205
-
206
- # Quick stats
207
- with gr.Accordion("📊 Quick Stats", open=True):
208
- stats_display = gr.HTML("Loading statistics...")
209
-
210
- # Export options
211
- with gr.Accordion("📥 Export Data", open=False):
212
- export_csv = gr.Button("📄 Export as CSV", variant="secondary")
213
- export_json = gr.Button("📋 Export as JSON", variant="secondary")
214
-
215
- def refresh():
216
- """Refresh the leaderboard data with performance optimization."""
217
- start_time = time.time()
218
- df = load_results(EVAL_RESULTS_DIR)
219
-
220
- if df.empty:
221
- return df, "No data available", "No data available", "No data available"
222
-
223
- # Get the column order from the dataframe
224
- cols = [c for c in df.columns if not c.startswith('_')]
225
-
226
- # Ensure all columns exist for the dataframe component
227
- for c in cols:
228
- if c not in df.columns:
229
- df[c] = None
230
-
231
- # Calculate stats
232
- total_models = len(df)
233
- best_per_val = df['Avg PER'].min() if 'Avg PER' in df.columns and not df['Avg PER'].isna().all() else "N/A"
234
- avg_duration_val = df['Avg Duration (s)'].mean() if 'Avg Duration (s)' in df.columns and not df['Avg Duration (s)'].isna().all() else "N/A"
235
-
236
- # Format stats
237
- best_per_str = f"{best_per_val:.2f}" if isinstance(best_per_val, (int, float)) else str(best_per_val)
238
- avg_duration_str = f"{avg_duration_val:.2f}s" if isinstance(avg_duration_val, (int, float)) else str(avg_duration_val)
239
-
240
- load_time = time.time() - start_time
241
-
242
- return (
243
- df[cols].round(3),
244
- f"<div class='stat-card'><div class='stat-value'>{total_models}</div><div class='stat-label'>Total Models</div></div>",
245
- f"<div class='stat-card'><div class='stat-value'>{best_per_str}</div><div class='stat-label'>Best PER</div></div>",
246
- f"<div class='stat-card'><div class='stat-value'>{avg_duration_str}</div><div class='stat-label'>Avg Duration</div></div>"
247
- )
248
-
249
- def export_csv_data():
250
- """Export data as CSV."""
251
- df = load_results(EVAL_RESULTS_DIR)
252
- if df.empty:
253
- return None
254
- cols = [c for c in df.columns if not c.startswith('_')]
255
- return df[cols].round(3)
256
-
257
- def export_json_data():
258
- """Export data as JSON."""
259
- df = load_results(EVAL_RESULTS_DIR)
260
- if df.empty:
261
- return None
262
- cols = [c for c in df.columns if not c.startswith('_')]
263
- return df[cols].round(3).to_json(orient='records', indent=2)
264
-
265
- # Connect events
266
- refresh_btn.click(
267
- fn=refresh,
268
- outputs=[table, total_models, best_per, avg_duration]
269
- )
270
-
271
- export_csv.click(
272
- fn=export_csv_data,
273
- outputs=gr.File(label="Download CSV")
274
- )
275
-
276
- export_json.click(
277
- fn=export_json_data,
278
- outputs=gr.File(label="Download JSON")
279
- )
280
-
281
- # Auto-load on start
282
- table.value, total_models.value, best_per.value, avg_duration.value = refresh()
283
-
284
- # Help section
285
- with gr.Accordion("ℹ️ About this Leaderboard", open=False):
286
- gr.Markdown("""
287
- ## 📊 Understanding the Results
288
-
289
- **Performance Metrics:**
290
- - **PER (Phoneme Error Rate)**: Lower values indicate better performance
291
- - **Avg Duration**: Processing time per sample (lower is faster)
292
- - **Models are ranked by average PER across all datasets**
293
-
294
- **Datasets Evaluated:**
295
- - `phoneme_asr`: General phoneme recognition dataset
296
- - `kids_phoneme_md`: Kids' phoneme recognition dataset
297
-
298
- **How to Interpret:**
299
- - **PER**: Percentage of phonemes incorrectly recognized (0% = perfect)
300
- - **Duration**: Time efficiency (important for real-time applications)
301
- - **Average PER**: Overall model performance across all datasets
302
-
303
- **Tips for Model Selection:**
304
- - Choose models with low PER for accuracy-critical applications
305
- - Consider duration for real-time or resource-constrained environments
306
- - Balance between accuracy (PER) and speed (Duration) based on your needs
307
- """)
308
-
309
- return demo
310
-
311
-
312
- if __name__ == "__main__":
313
- demo = build_interface()
314
- demo.queue().launch(
315
- server_name="0.0.0.0",
316
- server_port=7860,
317
- share=False
318
- )
constants.py ADDED
@@ -0,0 +1,87 @@
1
+ from pathlib import Path
2
+
3
+ # Directory where request by models are stored
4
+ DIR_OUTPUT_REQUESTS = Path("requested_models")
5
+ EVAL_REQUESTS_PATH = Path("eval_requests")
6
+
7
+ ##########################
8
+ # Text definitions #
9
+ ##########################
10
+
11
+ banner_url = "https://huggingface.co/datasets/reach-vb/random-images/resolve/main/phoneme_leaderboard.png"
12
+ BANNER = f'<div style="display: flex; justify-content: space-around;"><img src="{banner_url}" alt="Banner" style="width: 40vw; min-width: 300px; max-width: 600px;"> </div>'
13
+
14
+ TITLE = "<html> <head> <style> h1 {text-align: center;} </style> </head> <body> <h1> 🤗 Phoneme Detection Leaderboard </h1> </body> </html>"
15
+
16
+ INTRODUCTION_TEXT = """📐 The 🤗 Phoneme Detection Leaderboard ranks and evaluates phoneme recognition models
17
+ on the Hugging Face Hub.
18
+ \nWe report the Average [PER](https://en.wikipedia.org/wiki/Phoneme_error_rate) (⬇️ lower the better) and Average Duration. Models are ranked based on their Average PER, from lowest to highest. Check the 📈 Metrics tab to understand how the models are evaluated.
19
+ \nIf you want results for a model that is not listed here, you can submit a request for it to be included ✉️✨.
20
+ \nThe leaderboard includes phoneme recognition evaluation across multiple datasets."""
21
+
22
+ CITATION_TEXT = """@misc{phoneme-detection-leaderboard,
23
+ title = {Phoneme Detection Leaderboard},
24
+ author = {Your Name and Contributors},
25
+ year = 2024,
26
+ publisher = {Hugging Face},
27
+ howpublished = "\\url{https://huggingface.co/spaces/your-org/phoneme-detection-leaderboard}"
28
+ }
29
+ """
30
+
31
+ METRICS_TAB_TEXT = """
32
+ Here you will find details about the phoneme recognition metrics and datasets reported in our leaderboard.
33
+
34
+ ## Metrics
35
+
36
+ Models are evaluated using the Phoneme Error Rate (PER) metric. The PER metric
37
+ is used to assess the accuracy of a phoneme recognition system. Models are ranked in the leaderboard based
38
+ on their PER, lowest to highest.
39
+
40
+ ### Phoneme Error Rate (PER)
41
+
42
+ Phoneme Error Rate is used to measure the **accuracy** of automatic phoneme recognition systems. It calculates the percentage
43
+ of phonemes in the system's output that differ from the reference (correct) phoneme sequence. **A lower PER value indicates higher accuracy**.
44
+
45
+ The PER is calculated using sequence alignment between predicted and reference phoneme sequences, taking into account:
46
+ - Substitutions (S): predicted phoneme differs from reference
47
+ - Deletions (D): reference phoneme missing in prediction
48
+ - Insertions (I): predicted phoneme not in reference
49
+
50
+ ```
51
+ PER = (S + D + I) / N * 100
52
+ ```
53
+
54
+ Where N is the total number of reference phonemes.
55
+
56
+ ## How to reproduce our results
57
+
58
+ The Phoneme Detection Leaderboard is an effort to benchmark open source phoneme recognition models.
59
+ Along with the Leaderboard we're open-sourcing the codebase used for running these evaluations.
60
+
61
+ P.S. We'd love to know which other models you'd like us to benchmark next. Contributions are more than welcome! ♥️
62
+
63
+ ## Benchmark datasets
64
+
65
+ Evaluating Phoneme Recognition systems requires diverse datasets with phonetic transcriptions. We use multiple datasets to obtain robust evaluation scores for each model.
66
+
67
+ | Dataset | Description | Language | License |
68
+ |---------|-------------|----------|---------|
69
+ | phoneme_asr | General phoneme recognition dataset | English | Open |
70
+ | kids_phoneme_md | Children's speech phoneme dataset | English | Open |
71
+
72
+ For more details on the individual datasets and how models are evaluated, refer to our documentation.
73
+ """
74
+
75
+ LEADERBOARD_CSS = """
76
+ #leaderboard-table th .header-content {
77
+ white-space: nowrap;
78
+ }
79
+
80
+ #phoneme-table th .header-content {
81
+ white-space: nowrap;
82
+ }
83
+
84
+ #phoneme-table th:hover {
85
+ background-color: var(--table-row-focus);
86
+ }
87
+ """
eval-results/{results_1759289565_HuBERT-Base.json → results_1759378937_HuBERT-Base.json} RENAMED
@@ -1,17 +1,17 @@
1
  {
2
  "config": {
3
- "model_name": "local/HuBERT-Base",
4
  "model_dtype": "float32",
5
  "model_sha": ""
6
  },
7
  "results": {
8
  "phoneme_asr": {
9
  "per": 79.85359813133437,
10
- "avg_duration": 0.5645037651062011
11
  },
12
  "kids_phoneme_md": {
13
  "per": 71.85295670319688,
14
- "avg_duration": 1.0543905973434449
15
  }
16
  }
17
  }
 
1
  {
2
  "config": {
3
+ "model_name": "HuBERT-Base",
4
  "model_dtype": "float32",
5
  "model_sha": ""
6
  },
7
  "results": {
8
  "phoneme_asr": {
9
  "per": 79.85359813133437,
10
+ "avg_duration": 0.7736877918243408
11
  },
12
  "kids_phoneme_md": {
13
  "per": 71.85295670319688,
14
+ "avg_duration": 1.47061448097229
15
  }
16
  }
17
  }
eval-results/{results_1759289565_HuBERT-fine-tuned.json → results_1759378937_HuBERT-fine-tuned.json} RENAMED
@@ -1,17 +1,17 @@
1
  {
2
  "config": {
3
- "model_name": "local/HuBERT-fine-tuned",
4
  "model_dtype": "float32",
5
  "model_sha": ""
6
  },
7
  "results": {
8
  "phoneme_asr": {
9
  "per": 2.774112645808511,
10
- "avg_duration": 0.5711040496826172
11
  },
12
  "kids_phoneme_md": {
13
  "per": 12.210125572986708,
14
- "avg_duration": 1.0601478815078735
15
  }
16
  }
17
  }
 
1
  {
2
  "config": {
3
+ "model_name": "HuBERT-fine-tuned",
4
  "model_dtype": "float32",
5
  "model_sha": ""
6
  },
7
  "results": {
8
  "phoneme_asr": {
9
  "per": 2.774112645808511,
10
+ "avg_duration": 0.7994948387145996
11
  },
12
  "kids_phoneme_md": {
13
  "per": 12.210125572986708,
14
+ "avg_duration": 1.439890170097351
15
  }
16
  }
17
  }
eval-results/{results_1759289565_Timit.json → results_1759378937_Timit.json} RENAMED
@@ -1,17 +1,17 @@
1
  {
2
  "config": {
3
- "model_name": "local/Timit",
4
  "model_dtype": "float32",
5
  "model_sha": ""
6
  },
7
  "results": {
8
  "phoneme_asr": {
9
  "per": 36.477283094931195,
10
- "avg_duration": 0.554583740234375
11
  },
12
  "kids_phoneme_md": {
13
  "per": 40.59831492610759,
14
- "avg_duration": 1.0818484544754028
15
  }
16
  }
17
  }
 
1
  {
2
  "config": {
3
+ "model_name": "Timit",
4
  "model_dtype": "float32",
5
  "model_sha": ""
6
  },
7
  "results": {
8
  "phoneme_asr": {
9
  "per": 36.477283094931195,
10
+ "avg_duration": 0.8033712863922119
11
  },
12
  "kids_phoneme_md": {
13
  "per": 40.59831492610759,
14
+ "avg_duration": 1.455029034614563
15
  }
16
  }
17
  }
eval-results/results_1759378937_Whisper.json ADDED
@@ -0,0 +1,17 @@
1
+ {
2
+ "config": {
3
+ "model_name": "Whisper",
4
+ "model_dtype": "float32",
5
+ "model_sha": ""
6
+ },
7
+ "results": {
8
+ "phoneme_asr": {
9
+ "per": 80.66478307042628,
10
+ "avg_duration": 1.2233323097229003
11
+ },
12
+ "kids_phoneme_md": {
13
+ "per": 72.25186973830769,
14
+ "avg_duration": 1.3742226600646972
15
+ }
16
+ }
17
+ }
init.py ADDED
@@ -0,0 +1,89 @@
1
+ import os
2
+ from constants import EVAL_REQUESTS_PATH
3
+ from pathlib import Path
4
+ from huggingface_hub import HfApi, Repository
5
+
6
+ TOKEN_HUB = os.environ.get("TOKEN_HUB", None)
7
+ QUEUE_REPO = os.environ.get("QUEUE_REPO", None)
8
+ QUEUE_PATH = os.environ.get("QUEUE_PATH", None)
9
+
10
+ hf_api = HfApi(
11
+ endpoint="https://huggingface.co",
12
+ token=TOKEN_HUB,
13
+ )
14
+
15
+ def load_all_info_from_dataset_hub():
16
+ eval_queue_repo = None
17
+ requested_models = None
18
+
19
+ passed = True
20
+ if TOKEN_HUB is None:
21
+ passed = False
22
+ else:
23
+ print("Pulling evaluation requests and results.")
24
+
25
+ eval_queue_repo = Repository(
26
+ local_dir=QUEUE_PATH,
27
+ clone_from=QUEUE_REPO,
28
+ use_auth_token=TOKEN_HUB,
29
+ repo_type="dataset",
30
+ )
31
+ eval_queue_repo.git_pull()
32
+
33
+ # Local directory where dataset repo is cloned + folder with eval requests
34
+ directory = QUEUE_PATH / EVAL_REQUESTS_PATH
35
+ requested_models = get_all_requested_models(directory)
36
+ requested_models = [p.stem for p in requested_models]
37
+ # Local directory where dataset repo is cloned
38
+ csv_results = get_csv_with_results(QUEUE_PATH)
39
+ if csv_results is None:
40
+ passed = False
41
+ if not passed:
42
+ raise ValueError("No Hugging Face token provided. Skipping evaluation requests and results.")
43
+
44
+ return eval_queue_repo, requested_models, csv_results
45
+
46
+ def upload_file(requested_model_name, path_or_fileobj):
47
+ dest_repo_file = Path(EVAL_REQUESTS_PATH) / path_or_fileobj.name
48
+ dest_repo_file = str(dest_repo_file)
49
+ hf_api.upload_file(
50
+ path_or_fileobj=path_or_fileobj,
51
+ path_in_repo=str(dest_repo_file),
52
+ repo_id=QUEUE_REPO,
53
+ token=TOKEN_HUB,
54
+ repo_type="dataset",
55
+ commit_message=f"Add {requested_model_name} to eval queue")
56
+
57
+ def get_all_requested_models(directory):
58
+ directory = Path(directory)
59
+ all_requested_models = list(directory.glob("*.txt"))
60
+ return all_requested_models
61
+
62
+ def get_csv_with_results(directory):
63
+ directory = Path(directory)
64
+ all_csv_files = list(directory.glob("*.csv"))
65
+ latest = [f for f in all_csv_files if f.stem.endswith("latest")]
66
+ if len(latest) != 1:
67
+ return None
68
+ return latest[0]
69
+
70
+ def is_model_on_hub(model_name, revision="main"):
71
+ try:
72
+ model_name = model_name.replace(" ","")
73
+ author = model_name.split("/")[0]
74
+ model_id = model_name.split("/")[1]
75
+ if len(author) == 0 or len(model_id) == 0:
76
+ return False, "is not a valid model name. Please use the format `author/model_name`."
77
+ except Exception as e:
78
+ return False, "is not a valid model name. Please use the format `author/model_name`."
79
+
80
+ try:
81
+ models = list(hf_api.list_models(author=author, search=model_id))
82
+ matched = [model_name for m in models if m.modelId == model_name]
83
+ if len(matched) != 1:
84
+ return False, "was not found on the hub!"
85
+ else:
86
+ return True, None
87
+ except Exception as e:
88
+ print(f"Could not get the model from the hub.: {e}")
89
+ return False, "was not found on hub!"
src/phoneme_eval.py → phoneme_eval.py RENAMED
@@ -1,7 +1,7 @@
1
  import pandas as pd
2
- from src.utils.load_model import run_hubert_base, run_whisper, run_model, run_timit
3
- from src.utils.audio_process import calculate_error_rate, load_audio
4
- from src.utils.cmu_process import clean_cmu, cmu_to_ipa
5
 
6
  def set_output(model, pre_pho, ref_pho, duration, per, score):
7
  return {
@@ -16,7 +16,7 @@ def set_output(model, pre_pho, ref_pho, duration, per, score):
16
  # Map model names to their runner functions
17
  MODEL_RUNNERS = {
18
  "HuBERT-Base": run_hubert_base,
19
- # "Whisper": run_whisper,
20
  "HuBERT fine-tuned": run_model,
21
  "Timit": run_timit
22
  }
@@ -47,7 +47,7 @@ def benchmark_all(example):
47
  # Run all models
48
  results = [
49
  get_output("HuBERT-Base", wav, reference_phoneme),
50
- # get_output("Whisper", wav, reference_phoneme),
51
  get_output("HuBERT fine-tuned", wav, reference_phoneme),
52
  get_output("Timit", wav, reference_phoneme),
53
  ]
@@ -133,12 +133,13 @@ def main():
133
 
134
  # Save results for leaderboard consumption (one JSON per model)
135
  import json, os, time
136
- results_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "eval-results")
 
137
  os.makedirs(results_dir, exist_ok=True)
138
 
139
  timestamp = int(time.time())
140
  for model_name, task_results in per_model_results.items():
141
- org_model = f"local/{model_name}"
142
  payload = {
143
  "config": {
144
  "model_name": org_model,
 
1
  import pandas as pd
2
+ from utils.load_model import run_hubert_base, run_whisper, run_model, run_timit
3
+ from utils.audio_process import calculate_error_rate, load_audio
4
+ from utils.cmu_process import clean_cmu, cmu_to_ipa
5
 
6
  def set_output(model, pre_pho, ref_pho, duration, per, score):
7
  return {
 
16
  # Map model names to their runner functions
17
  MODEL_RUNNERS = {
18
  "HuBERT-Base": run_hubert_base,
19
+ "Whisper": run_whisper,
20
  "HuBERT fine-tuned": run_model,
21
  "Timit": run_timit
22
  }
 
47
  # Run all models
48
  results = [
49
  get_output("HuBERT-Base", wav, reference_phoneme),
50
+ get_output("Whisper", wav, reference_phoneme),
51
  get_output("HuBERT fine-tuned", wav, reference_phoneme),
52
  get_output("Timit", wav, reference_phoneme),
53
  ]
 
133
 
134
  # Save results for leaderboard consumption (one JSON per model)
135
  import json, os, time
136
+ # results_dir = os.path.join(os.path.dirname(os.path.dirname(__file__)), "eval-results")
137
+ results_dir = "eval-results"
138
  os.makedirs(results_dir, exist_ok=True)
139
 
140
  timestamp = int(time.time())
141
  for model_name, task_results in per_model_results.items():
142
+ org_model = model_name
143
  payload = {
144
  "config": {
145
  "model_name": org_model,
pyproject.toml DELETED
@@ -1,13 +0,0 @@
1
- [tool.ruff]
2
- # Enable pycodestyle (`E`) and Pyflakes (`F`) codes by default.
3
- select = ["E", "F"]
4
- ignore = ["E501"] # line too long (black is taking care of this)
5
- line-length = 119
6
- fixable = ["A", "B", "C", "D", "E", "F", "G", "I", "N", "Q", "S", "T", "W", "ANN", "ARG", "BLE", "COM", "DJ", "DTZ", "EM", "ERA", "EXE", "FBT", "ICN", "INP", "ISC", "NPY", "PD", "PGH", "PIE", "PL", "PT", "PTH", "PYI", "RET", "RSE", "RUF", "SIM", "SLF", "TCH", "TID", "TRY", "UP", "YTT"]
7
-
8
- [tool.isort]
9
- profile = "black"
10
- line_length = 119
11
-
12
- [tool.black]
13
- line-length = 119
requirements.txt CHANGED
@@ -1,21 +1,13 @@
1
- APScheduler
2
- black
3
- datasets
4
  gradio
5
- gradio[oauth]
6
- gradio_leaderboard==0.0.13
7
- gradio_client
8
- huggingface-hub>=0.18.0
9
- matplotlib
10
- numpy
11
  pandas
12
- python-dateutil
13
- tqdm
14
  transformers
15
- tokenizers>=0.15.0
16
- sentencepiece
17
  torch
 
 
 
 
 
18
  nltk
19
  g2p-en
20
- librosa
21
- soundfile
 
 
 
 
1
  gradio
 
 
 
 
 
 
2
  pandas
3
+ numpy
 
4
  transformers
 
 
5
  torch
6
+ torchaudio
7
+ datasets
8
+ huggingface-hub
9
+ soundfile
10
+ librosa
11
  nltk
12
  g2p-en
13
+ python-dotenv
 
src/about.py DELETED
@@ -1,74 +0,0 @@
1
- from dataclasses import dataclass
2
- from enum import Enum
3
-
4
- @dataclass
5
- class Task:
6
- benchmark: str
7
- metric: str
8
- col_name: str
9
-
10
-
11
- # Select your tasks here
12
- # ---------------------------------------------------
13
- class Tasks(Enum):
14
- # task_key in the results json, metric_key, column name for display
15
- # Using actual dataset names as keys
16
- phoneme_asr = Task("phoneme_asr", "per", "PER phoneme_asr")
17
- kids_phoneme_md = Task("kids_phoneme_md", "per", "PER kids_phoneme_md")
18
-
19
- NUM_FEWSHOT = 0 # Change with your few shot
20
- # ---------------------------------------------------
21
-
22
-
23
-
24
- # Your leaderboard name
25
- TITLE = """<h1 align="center" id="space-title">Phoneme Detection Leaderboard</h1>"""
26
-
27
- # What does your leaderboard evaluate?
28
- INTRODUCTION_TEXT = """
29
- This leaderboard ranks phoneme detection models by average PER (lower is better).
30
- Evaluations aggregate across phoneme_asr and kids_phoneme_md datasets for a fair comparison.
31
- """
32
-
33
- # Which evaluations are you running? how can people reproduce what you have?
34
- LLM_BENCHMARKS_TEXT = f"""
35
- ## How it works
36
- We compute Phoneme Error Rate (PER) per dataset/split and aggregate an average.
37
-
38
- ## Reproducibility
39
- Ensure your model and tokenizer can be loaded via Transformers AutoClasses.
40
- """
41
-
42
- EVALUATION_QUEUE_TEXT = """
43
- ## Some good practices before submitting a model
44
-
45
- ### 1) Make sure you can load your model and tokenizer using AutoClasses:
46
- ```python
47
- from transformers import AutoConfig, AutoModel, AutoTokenizer
48
- config = AutoConfig.from_pretrained("your model name", revision=revision)
49
- model = AutoModel.from_pretrained("your model name", revision=revision)
50
- tokenizer = AutoTokenizer.from_pretrained("your model name", revision=revision)
51
- ```
52
- If this step fails, follow the error messages to debug your model before submitting it. It's likely your model has been improperly uploaded.
53
-
54
- Note: make sure your model is public!
55
- Note: if your model needs `use_remote_code=True`, we do not support this option yet but we are working on adding it, stay posted!
56
-
57
- ### 2) Convert your model weights to [safetensors](https://huggingface.co/docs/safetensors/index)
58
- It's a new format for storing weights which is safer and faster to load and use. It will also allow us to add the number of parameters of your model to the `Extended Viewer`!
59
-
60
- ### 3) Make sure your model has an open license!
61
- This is a leaderboard for Open LLMs, and we'd love for as many people as possible to know they can use your model 🤗
62
-
63
- ### 4) Fill up your model card
64
- When we add extra information about models to the leaderboard, it will be automatically taken from the model card
65
-
66
- ## In case of model failure
67
- If your model is displayed in the `FAILED` category, its execution stopped.
68
- Make sure you have followed the above steps first.
69
- If everything is done, check you can launch the EleutherAIHarness on your model locally, using the above command without modifications (you can add `--limit` to limit the number of examples per task).
70
- """
71
-
72
- CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
73
- CITATION_BUTTON_TEXT = r"""
74
- """
src/display/css_html_js.py DELETED
@@ -1,105 +0,0 @@
1
- custom_css = """
2
-
3
- .markdown-text {
4
- font-size: 16px !important;
5
- }
6
-
7
- #models-to-add-text {
8
- font-size: 18px !important;
9
- }
10
-
11
- #citation-button span {
12
- font-size: 16px !important;
13
- }
14
-
15
- #citation-button textarea {
16
- font-size: 16px !important;
17
- }
18
-
19
- #citation-button > label > button {
20
- margin: 6px;
21
- transform: scale(1.3);
22
- }
23
-
24
- #leaderboard-table {
25
- margin-top: 15px
26
- }
27
-
28
- #leaderboard-table-lite {
29
- margin-top: 15px
30
- }
31
-
32
- #search-bar-table-box > div:first-child {
33
- background: none;
34
- border: none;
35
- }
36
-
37
- #search-bar {
38
- padding: 0px;
39
- }
40
-
41
- /* Limit the width of the first AutoEvalColumn so that names don't expand too much */
42
- #leaderboard-table td:nth-child(2),
43
- #leaderboard-table th:nth-child(2) {
44
- max-width: 400px;
45
- overflow: auto;
46
- white-space: nowrap;
47
- }
48
-
49
- .tab-buttons button {
50
- font-size: 20px;
51
- }
52
-
53
- #scale-logo {
54
- border-style: none !important;
55
- box-shadow: none;
56
- display: block;
57
- margin-left: auto;
58
- margin-right: auto;
59
- max-width: 600px;
60
- }
61
-
62
- #scale-logo .download {
63
- display: none;
64
- }
65
- #filter_type{
66
- border: 0;
67
- padding-left: 0;
68
- padding-top: 0;
69
- }
70
- #filter_type label {
71
- display: flex;
72
- }
73
- #filter_type label > span{
74
- margin-top: var(--spacing-lg);
75
- margin-right: 0.5em;
76
- }
77
- #filter_type label > .wrap{
78
- width: 103px;
79
- }
80
- #filter_type label > .wrap .wrap-inner{
81
- padding: 2px;
82
- }
83
- #filter_type label > .wrap .wrap-inner input{
84
- width: 1px
85
- }
86
- #filter-columns-type{
87
- border:0;
88
- padding:0.5;
89
- }
90
- #filter-columns-size{
91
- border:0;
92
- padding:0.5;
93
- }
94
- #box-filter > .form{
95
- border: 0
96
- }
97
- """
98
-
99
- get_window_url_params = """
100
- function(url_params) {
101
- const params = new URLSearchParams(window.location.search);
102
- url_params = Object.fromEntries(params);
103
- return url_params;
104
- }
105
- """
src/display/formatting.py DELETED
@@ -1,27 +0,0 @@
1
- def model_hyperlink(link, model_name):
2
- return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
3
-
4
-
5
- def make_clickable_model(model_name):
6
- link = f"https://huggingface.co/{model_name}"
7
- return model_hyperlink(link, model_name)
8
-
9
-
10
- def styled_error(error):
11
- return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
12
-
13
-
14
- def styled_warning(warn):
15
- return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
16
-
17
-
18
- def styled_message(message):
19
- return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
20
-
21
-
22
- def has_no_nan_values(df, columns):
23
- return df[columns].notna().all(axis=1)
24
-
25
-
26
- def has_nan_values(df, columns):
27
- return df[columns].isna().any(axis=1)
src/display/utils.py DELETED
@@ -1,72 +0,0 @@
1
- from dataclasses import dataclass, make_dataclass
2
- from enum import Enum
3
- import pandas as pd
4
-
5
- from src.about import Tasks # assume Tasks = [Task1, Task2, ...]
6
-
7
- def fields(raw_class):
8
- return [
9
- v for k, v in raw_class.__dict__.items()
10
- if not (k.startswith("__") and k.endswith("__"))
11
- ]
12
-
13
- @dataclass
14
- class ColumnContent:
15
- name: str
16
- type: str
17
- displayed_by_default: bool
18
- hidden: bool = False
19
- never_hidden: bool = False
20
-
21
- # -------------------------------------------------------------------
22
- # Build leaderboard columns
23
- # -------------------------------------------------------------------
24
- auto_eval_column_dict = []
25
-
26
- # Rank/Model/Badge
27
- auto_eval_column_dict.append(["rank", ColumnContent, ColumnContent("Rank", "number", True, never_hidden=True)])
28
- auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "markdown", True, never_hidden=True)])
29
- auto_eval_column_dict.append(["badge", ColumnContent, ColumnContent("Badge", "str", True)])
30
-
31
- # Per-dataset metrics
32
- # Example: "PER ⬇️ (TIMIT)", "Avg Duration (s) (TIMIT)"
33
- for task in Tasks:
34
- dataset_name = task.name # short name
35
- col_base = task.value.col_name # e.g. "PER ⬇️"
36
- # allow multiple metrics per dataset if needed
37
- auto_eval_column_dict.append([
38
- f"{dataset_name}_per",
39
- ColumnContent,
40
- ColumnContent(f"{col_base} ({dataset_name})", "number", True),
41
- ])
42
- auto_eval_column_dict.append([
43
- f"{dataset_name}_avg_duration",
44
- ColumnContent,
45
- ColumnContent(f"Avg Duration (s) ({dataset_name})", "number", True),
46
- ])
47
-
48
- # Global average across datasets
49
- auto_eval_column_dict.append([
50
- "average", ColumnContent, ColumnContent("Avg PER ⬇️ (All)", "number", True)
51
- ])
52
-
53
- # Extra model info
54
- auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
55
- auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
56
- auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
57
- auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
58
- auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
59
- auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
60
- auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
61
- auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
62
- auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
63
-
64
- # Final dataclass
65
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
66
-
67
- # -------------------------------------------------------------------
68
- # Example: Create dataframe header
69
- # -------------------------------------------------------------------
70
- COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
71
-
72
- df = pd.DataFrame(columns=[c.name for c in fields(AutoEvalColumn)])
src/envs.py DELETED
@@ -1,25 +0,0 @@
1
- import os
2
-
3
- from huggingface_hub import HfApi
4
-
5
- # Info to change for your repository
6
- # ----------------------------------
7
- TOKEN = os.environ.get("HF_TOKEN") # A read/write token for your org
8
-
9
- OWNER = "demo-leaderboard-backend" # Change to your org - don't forget to create a results and request dataset, with the correct format!
10
- # ----------------------------------
11
-
12
- REPO_ID = f"{OWNER}/leaderboard"
13
- QUEUE_REPO = f"{OWNER}/requests"
14
- RESULTS_REPO = f"{OWNER}/results"
15
-
16
- # If you setup a cache later, just change HF_HOME
17
- CACHE_PATH=os.getenv("HF_HOME", ".")
18
-
19
- # Local caches
20
- EVAL_REQUESTS_PATH = os.path.join(CACHE_PATH, "eval-queue")
21
- EVAL_RESULTS_PATH = os.path.join(CACHE_PATH, "eval-results")
22
- EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
23
- EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
24
-
25
- API = HfApi(token=TOKEN)
src/leaderboard/read_evals.py DELETED
@@ -1,207 +0,0 @@
1
- import glob
2
- import json
3
- import math
4
- import os
5
- from dataclasses import dataclass
6
-
7
- import dateutil
8
- import numpy as np
9
-
10
- from src.display.formatting import make_clickable_model
11
- from src.display.utils import AutoEvalColumn
12
- from src.about import Tasks
13
- from src.submission.check_validity import is_model_on_hub
14
-
15
-
16
- @dataclass
17
- class EvalResult:
18
- """Represents one full evaluation. Built from a combination of the result and request file for a given run.
19
- """
20
- eval_name: str # org_model_precision (uid)
21
- full_model: str # org/model (path on hub)
22
- org: str
23
- model: str
24
- revision: str # commit hash, "" if main
25
- results: dict
26
- precision: str = "Unknown"
27
- model_type: str = "Unknown" # Pretrained, fine tuned, ...
28
- weight_type: str = "Original" # Original or Adapter
29
- architecture: str = "Unknown"
30
- license: str = "?"
31
- likes: int = 0
32
- num_params: int = 0
33
- date: str = "" # submission date of request file
34
- still_on_hub: bool = False
35
-
36
- @classmethod
37
- def init_from_json_file(self, json_filepath):
38
- """Inits the result from the specific model result file"""
39
- with open(json_filepath) as fp:
40
- data = json.load(fp)
41
-
42
- config = data.get("config")
43
-
44
- # Precision
45
- precision = str(config.get("model_dtype", "Unknown"))
46
-
47
- # Get model and org
48
- org_and_model = config.get("model_name", config.get("model_args", None))
49
- org_and_model = org_and_model.split("/", 1)
50
-
51
- if len(org_and_model) == 1:
52
- org = None
53
- model = org_and_model[0]
54
- result_key = f"{model}_{precision}"
55
- else:
56
- org = org_and_model[0]
57
- model = org_and_model[1]
58
- result_key = f"{org}_{model}_{precision}"
59
- full_model = "/".join(org_and_model)
60
-
61
- still_on_hub, _, model_config = is_model_on_hub(
62
- full_model, config.get("model_sha", "main"), trust_remote_code=True, test_tokenizer=False
63
- )
64
- architecture = "?"
65
- if model_config is not None:
66
- architectures = getattr(model_config, "architectures", None)
67
- if architectures:
68
- architecture = ";".join(architectures)
69
-
70
- # Extract results available in this file (some results are split in several files)
71
- results = {}
72
- for task in Tasks:
73
- task = task.value
74
-
75
- # We average all scores of a given metric (not all metrics are present in all files)
76
- per_vals = np.array([v.get(task.metric, None) for k, v in data["results"].items() if task.benchmark == k])
77
- if per_vals.size > 0 and not any([val is None for val in per_vals]):
78
- results[f"{task.benchmark}_per"] = float(np.mean(per_vals))
79
-
80
- # Average duration if present
81
- dur_vals = np.array([v.get("avg_duration", None) for k, v in data["results"].items() if task.benchmark == k])
82
- if dur_vals.size > 0 and not any([val is None for val in dur_vals]):
83
- results[f"{task.benchmark}_avg_duration"] = float(np.mean(dur_vals))
84
-
85
- return self(
86
- eval_name=result_key,
87
- full_model=full_model,
88
- org=org,
89
- model=model,
90
- results=results,
91
- precision=precision,
92
- revision= config.get("model_sha", ""),
93
- still_on_hub=still_on_hub,
94
- architecture=architecture
95
- )
96
-
97
- def update_with_request_file(self, requests_path):
98
- """Finds the relevant request file for the current model and updates info with it"""
99
- request_file = get_request_file_for_model(requests_path, self.full_model, self.precision)
100
-
101
- try:
102
- with open(request_file, "r") as f:
103
- request = json.load(f)
104
- self.model_type = str(request.get("model_type", "Unknown"))
105
- self.weight_type = str(request.get("weight_type", "Original"))
106
- self.license = request.get("license", "?")
107
- self.likes = request.get("likes", 0)
108
- self.num_params = request.get("params", 0)
109
- self.date = request.get("submitted_time", "")
110
- except Exception:
111
- print(f"Could not find request file for {self.org}/{self.model} with precision {self.precision}")
112
-
113
- def to_dict(self):
114
- """Converts the Eval Result to a dict compatible with our dataframe display"""
115
- # Compute average PER across tasks from per-keys only
116
- per_values = [v for k, v in self.results.items() if k.endswith("_per") and v is not None]
117
- average = sum(per_values) / len(per_values) if per_values else None
118
- data_dict = {
119
- AutoEvalColumn.rank.name: None,
120
- AutoEvalColumn.badge.name: "",
121
- "eval_name": self.eval_name, # not a column, just a save name,
122
- AutoEvalColumn.precision.name: self.precision,
123
- AutoEvalColumn.model_type.name: self.model_type,
124
- AutoEvalColumn.weight_type.name: self.weight_type,
125
- AutoEvalColumn.architecture.name: self.architecture,
126
- AutoEvalColumn.model.name: make_clickable_model(self.full_model),
127
- AutoEvalColumn.revision.name: self.revision,
128
- AutoEvalColumn.average.name: average,
129
- AutoEvalColumn.license.name: self.license,
130
- AutoEvalColumn.likes.name: self.likes,
131
- AutoEvalColumn.params.name: self.num_params,
132
- AutoEvalColumn.still_on_hub.name: self.still_on_hub,
133
- }
134
-
135
- for task in Tasks:
136
- dataset = task.name
137
- # Use display labels matching utils.AutoEvalColumn definitions
138
- per_label = f"{task.value.col_name} ({dataset})"
139
- dur_label = f"Avg Duration (s) ({dataset})"
140
- data_dict[per_label] = self.results.get(f"{task.value.benchmark}_per")
141
- data_dict[dur_label] = self.results.get(f"{task.value.benchmark}_avg_duration")
142
-
143
- return data_dict
144
-
145
-
146
- def get_request_file_for_model(requests_path, model_name, precision):
147
- """Selects the correct request file for a given model. Only keeps runs tagged as FINISHED"""
148
- request_files = os.path.join(
149
- requests_path,
150
- f"{model_name}_eval_request_*.json",
151
- )
152
- request_files = glob.glob(request_files)
153
-
154
- # Select correct request file (precision)
155
- request_file = ""
156
- request_files = sorted(request_files, reverse=True)
157
- for tmp_request_file in request_files:
158
- with open(tmp_request_file, "r") as f:
159
- req_content = json.load(f)
160
- if (
161
- req_content["status"] in ["FINISHED"]
162
- and req_content["precision"] == precision.split(".")[-1]
163
- ):
164
- request_file = tmp_request_file
165
- return request_file
166
-
167
-
168
- def get_raw_eval_results(results_path: str, requests_path: str) -> list[EvalResult]:
169
- """From the path of the results folder root, extract all needed info for results"""
170
- model_result_filepaths = []
171
-
172
- for root, _, files in os.walk(results_path):
173
- # We should only have json files in model results
174
- if len(files) == 0 or any([not f.endswith(".json") for f in files]):
175
- continue
176
-
177
- # Sort the files by date
178
- try:
179
- files.sort(key=lambda x: x.removesuffix(".json").removeprefix("results_")[:-7])
180
- except dateutil.parser._parser.ParserError:
181
- files = [files[-1]]
182
-
183
- for file in files:
184
- model_result_filepaths.append(os.path.join(root, file))
185
-
186
- eval_results = {}
187
- for model_result_filepath in model_result_filepaths:
188
- # Creation of result
189
- eval_result = EvalResult.init_from_json_file(model_result_filepath)
190
- eval_result.update_with_request_file(requests_path)
191
-
192
- # Store results of same eval together
193
- eval_name = eval_result.eval_name
194
- if eval_name in eval_results.keys():
195
- eval_results[eval_name].results.update({k: v for k, v in eval_result.results.items() if v is not None})
196
- else:
197
- eval_results[eval_name] = eval_result
198
-
199
- results = []
200
- for v in eval_results.values():
201
- try:
202
- v.to_dict() # we test if the dict version is complete
203
- results.append(v)
204
- except KeyError: # not all eval values present
205
- continue
206
-
207
- return results
src/populate.py DELETED
@@ -1,63 +0,0 @@
1
- import json
2
- import os
3
-
4
- import pandas as pd
5
-
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
-
10
-
11
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
12
- """Creates a dataframe from all the individual experiment results"""
13
- raw_data = get_raw_eval_results(results_path, requests_path)
14
- all_data_json = [v.to_dict() for v in raw_data]
15
-
16
- df = pd.DataFrame.from_records(all_data_json)
17
- # If no data yet, return an empty DataFrame with expected columns
18
- if df.empty or AutoEvalColumn.average.name not in df.columns:
19
- return pd.DataFrame(columns=cols)
20
-
21
- # Lower PER is better: sort ascending
22
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=True)
23
- df = df[cols].round(decimals=2)
24
-
25
- # filter out if any of the benchmarks have not been produced
26
- df = df[has_no_nan_values(df, benchmark_cols)]
27
- return df
28
-
29
-
30
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
31
- """Creates the different dataframes for the evaluation queues requestes"""
32
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
33
- all_evals = []
34
-
35
- for entry in entries:
36
- if ".json" in entry:
37
- file_path = os.path.join(save_path, entry)
38
- with open(file_path) as fp:
39
- data = json.load(fp)
40
-
41
- data["Model"] = make_clickable_model(data["model"])
42
- data["Model sha"] = data.get("revision", "main")
43
-
44
- all_evals.append(data)
45
- elif ".md" not in entry:
46
- # this is a folder
47
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if os.path.isfile(e) and not e.startswith(".")]
48
- for sub_entry in sub_entries:
49
- file_path = os.path.join(save_path, entry, sub_entry)
50
- with open(file_path) as fp:
51
- data = json.load(fp)
52
-
53
- data["Model"] = make_clickable_model(data["model"])
54
- data["Model sha"] = data.get("revision", "main")
55
- all_evals.append(data)
56
-
57
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
58
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
59
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
60
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
61
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
62
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
63
- return df_finished[cols], df_running[cols], df_pending[cols]
src/submission/check_validity.py DELETED
@@ -1,99 +0,0 @@
1
- import json
2
- import os
3
- import re
4
- from collections import defaultdict
5
- from datetime import datetime, timedelta, timezone
6
-
7
- import huggingface_hub
8
- from huggingface_hub import ModelCard
9
- from huggingface_hub.hf_api import ModelInfo
10
- from transformers import AutoConfig
11
- from transformers.models.auto.tokenization_auto import AutoTokenizer
12
-
13
- def check_model_card(repo_id: str) -> tuple[bool, str]:
14
- """Checks if the model card and license exist and have been filled"""
15
- try:
16
- card = ModelCard.load(repo_id)
17
- except huggingface_hub.utils.EntryNotFoundError:
18
- return False, "Please add a model card to your model to explain how you trained/fine-tuned it."
19
-
20
- # Enforce license metadata
21
- if card.data.license is None:
22
- if not ("license_name" in card.data and "license_link" in card.data):
23
- return False, (
24
- "License not found. Please add a license to your model card using the `license` metadata or a"
25
- " `license_name`/`license_link` pair."
26
- )
27
-
28
- # Enforce card content
29
- if len(card.text) < 200:
30
- return False, "Please add a description to your model card, it is too short."
31
-
32
- return True, ""
33
-
34
- def is_model_on_hub(model_name: str, revision: str, token: str = None, trust_remote_code=False, test_tokenizer=False) -> tuple[bool, str]:
35
- """Checks if the model model_name is on the hub, and whether it (and its tokenizer) can be loaded with AutoClasses."""
36
- try:
37
- config = AutoConfig.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
38
- if test_tokenizer:
39
- try:
40
- tk = AutoTokenizer.from_pretrained(model_name, revision=revision, trust_remote_code=trust_remote_code, token=token)
41
- except ValueError as e:
42
- return (
43
- False,
44
- f"uses a tokenizer which is not in a transformers release: {e}",
45
- None
46
- )
47
- except Exception as e:
48
- return (False, "'s tokenizer cannot be loaded. Is your tokenizer class in a stable transformers release, and correctly configured?", None)
49
- return True, None, config
50
-
51
- except ValueError:
52
- return (
53
- False,
54
- "needs to be launched with `trust_remote_code=True`. For safety reason, we do not allow these models to be automatically submitted to the leaderboard.",
55
- None
56
- )
57
-
58
- except Exception as e:
59
- return False, "was not found on hub!", None
60
-
61
-
62
- def get_model_size(model_info: ModelInfo, precision: str):
63
- """Gets the model size from the configuration, or the model name if the configuration does not contain the information."""
64
- try:
65
- model_size = round(model_info.safetensors["total"] / 1e9, 3)
66
- except (AttributeError, TypeError):
67
- return 0 # Unknown model sizes are indicated as 0, see NUMERIC_INTERVALS in app.py
68
-
69
- size_factor = 8 if (precision == "GPTQ" or "gptq" in model_info.modelId.lower()) else 1
70
- model_size = size_factor * model_size
71
- return model_size
72
-
73
- def get_model_arch(model_info: ModelInfo):
74
- """Gets the model architecture from the configuration"""
75
- return model_info.config.get("architectures", "Unknown")
76
-
77
- def already_submitted_models(requested_models_dir: str) -> set[str]:
78
- """Gather a list of already submitted models to avoid duplicates"""
79
- depth = 1
80
- file_names = []
81
- users_to_submission_dates = defaultdict(list)
82
-
83
- for root, _, files in os.walk(requested_models_dir):
84
- current_depth = root.count(os.sep) - requested_models_dir.count(os.sep)
85
- if current_depth == depth:
86
- for file in files:
87
- if not file.endswith(".json"):
88
- continue
89
- with open(os.path.join(root, file), "r") as f:
90
- info = json.load(f)
91
- file_names.append(f"{info['model']}_{info['revision']}_{info['precision']}")
92
-
93
- # Select organisation
94
- if info["model"].count("/") == 0 or "submitted_time" not in info:
95
- continue
96
- organisation, _ = info["model"].split("/")
97
- users_to_submission_dates[organisation].append(info["submitted_time"])
98
-
99
- return set(file_names), users_to_submission_dates
src/submission/submit.py DELETED
@@ -1,119 +0,0 @@
1
- import json
2
- import os
3
- from datetime import datetime, timezone
4
-
5
- from src.display.formatting import styled_error, styled_message, styled_warning
6
- from src.envs import API, EVAL_REQUESTS_PATH, TOKEN, QUEUE_REPO
7
- from src.submission.check_validity import (
8
- already_submitted_models,
9
- check_model_card,
10
- get_model_size,
11
- is_model_on_hub,
12
- )
13
-
14
- REQUESTED_MODELS = None
15
- USERS_TO_SUBMISSION_DATES = None
16
-
17
- def add_new_eval(
18
- model: str,
19
- base_model: str,
20
- revision: str,
21
- precision: str,
22
- weight_type: str,
23
- model_type: str,
24
- ):
25
- global REQUESTED_MODELS
26
- global USERS_TO_SUBMISSION_DATES
27
- if not REQUESTED_MODELS:
28
- REQUESTED_MODELS, USERS_TO_SUBMISSION_DATES = already_submitted_models(EVAL_REQUESTS_PATH)
29
-
30
- user_name = ""
31
- model_path = model
32
- if "/" in model:
33
- user_name = model.split("/")[0]
34
- model_path = model.split("/")[1]
35
-
36
- precision = precision.split(" ")[0]
37
- current_time = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
38
-
39
- if model_type is None or model_type == "":
40
- return styled_error("Please select a model type.")
41
-
42
- # Does the model actually exist?
43
- if revision == "":
44
- revision = "main"
45
-
46
- # Is the model on the hub?
47
- if weight_type in ["Delta", "Adapter"]:
48
- base_model_on_hub, error, _ = is_model_on_hub(model_name=base_model, revision=revision, token=TOKEN, test_tokenizer=True)
49
- if not base_model_on_hub:
50
- return styled_error(f'Base model "{base_model}" {error}')
51
-
52
- if not weight_type == "Adapter":
53
- model_on_hub, error, _ = is_model_on_hub(model_name=model, revision=revision, token=TOKEN, test_tokenizer=True)
54
- if not model_on_hub:
55
- return styled_error(f'Model "{model}" {error}')
56
-
57
- # Is the model info correctly filled?
58
- try:
59
- model_info = API.model_info(repo_id=model, revision=revision)
60
- except Exception:
61
- return styled_error("Could not get your model information. Please fill it up properly.")
62
-
63
- model_size = get_model_size(model_info=model_info, precision=precision)
64
-
65
- # Were the model card and license filled?
66
- try:
67
- license = model_info.cardData["license"]
68
- except Exception:
69
- return styled_error("Please select a license for your model")
70
-
71
- modelcard_OK, error_msg = check_model_card(model)
72
- if not modelcard_OK:
73
- return styled_error(error_msg)
74
-
75
- # Seems good, creating the eval
76
- print("Adding new eval")
77
-
78
- eval_entry = {
79
- "model": model,
80
- "base_model": base_model,
81
- "revision": revision,
82
- "precision": precision,
83
- "weight_type": weight_type,
84
- "status": "PENDING",
85
- "submitted_time": current_time,
86
- "model_type": model_type,
87
- "likes": model_info.likes,
88
- "params": model_size,
89
- "license": license,
90
- "private": False,
91
- }
92
-
93
- # Check for duplicate submission
94
- if f"{model}_{revision}_{precision}" in REQUESTED_MODELS:
95
- return styled_warning("This model has been already submitted.")
96
-
97
- print("Creating eval file")
98
- OUT_DIR = f"{EVAL_REQUESTS_PATH}/{user_name}"
99
- os.makedirs(OUT_DIR, exist_ok=True)
100
- out_path = f"{OUT_DIR}/{model_path}_eval_request_False_{precision}_{weight_type}.json"
101
-
102
- with open(out_path, "w") as f:
103
- f.write(json.dumps(eval_entry))
104
-
105
- print("Uploading eval file")
106
- API.upload_file(
107
- path_or_fileobj=out_path,
108
- path_in_repo=out_path.split("eval-queue/")[1],
109
- repo_id=QUEUE_REPO,
110
- repo_type="dataset",
111
- commit_message=f"Add {model} to eval queue",
112
- )
113
-
114
- # Remove the local file
115
- os.remove(out_path)
116
-
117
- return styled_message(
118
- "Your request has been submitted to the evaluation queue!\nPlease wait for up to an hour for the model to show in the PENDING list."
119
- )
test_basic.py ADDED
@@ -0,0 +1,115 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Basic test to verify the cleaned up phoneme detection leaderboard functionality.
4
+ """
5
+
6
+ import os
7
+ import sys
8
+ import json
9
+ import tempfile
10
+ import pandas as pd
11
+
12
+ # Add current directory to path
13
+ sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
14
+
15
+ def test_imports():
16
+ """Test that all modules can be imported"""
17
+ try:
18
+ from constants import BANNER, INTRODUCTION_TEXT
19
+ from utils_display import PhonemeEvalColumn, make_clickable_model
20
+ from init import is_model_on_hub
21
+ print("All imports successful")
22
+ return True
23
+ except ImportError as e:
24
+ print(f"Import error: {e}")
25
+ return False
26
+
27
+ def test_data_loading():
28
+ """Test that the app can load data from eval-results directory"""
29
+ try:
30
+ from app import load_results, EVAL_RESULTS_DIR
31
+
32
+ # Create a temporary test result
33
+ os.makedirs(EVAL_RESULTS_DIR, exist_ok=True)
34
+ test_result = {
35
+ "config": {
36
+ "model_name": "test/model",
37
+ "model_dtype": "float32",
38
+ "model_sha": "test123"
39
+ },
40
+ "results": {
41
+ "phoneme_asr": {"per": 15.5, "avg_duration": 0.1},
42
+ "kids_phoneme_md": {"per": 18.2, "avg_duration": 0.12}
43
+ }
44
+ }
45
+
46
+ test_file = os.path.join(EVAL_RESULTS_DIR, "test_results.json")
47
+ with open(test_file, "w") as f:
48
+ json.dump(test_result, f)
49
+
50
+ # Test loading
51
+ df = load_results(EVAL_RESULTS_DIR)
52
+ print(f"Data loading successful, found {len(df)} rows")
53
+
54
+ # Clean up
55
+ os.remove(test_file)
56
+ return True
57
+
58
+ except Exception as e:
59
+ print(f"Data loading error: {e}")
60
+ return False
61
+
62
+ def test_utils():
63
+ """Test utility functions"""
64
+ try:
65
+ from utils_display import make_clickable_model, styled_error, styled_message
66
+
67
+ # Test model link generation
68
+ link = make_clickable_model("facebook/hubert-base")
69
+ assert "facebook/hubert-base" in link
70
+ assert "href=" in link
71
+
72
+ # Test styled messages
73
+ error_msg = styled_error("Test error")
74
+ assert "red" in error_msg
75
+
76
+ success_msg = styled_message("Test success")
77
+ assert "green" in success_msg
78
+
79
+ print("Utility functions working")
80
+ return True
81
+
82
+ except Exception as e:
83
+ print(f"Utility test error: {e}")
84
+ return False
85
+
86
+ def main():
87
+ """Run all tests"""
88
+ print("Testing Phoneme Detection Leaderboard...")
89
+
90
+ tests = [
91
+ test_imports,
92
+ test_data_loading,
93
+ test_utils
94
+ ]
95
+
96
+ passed = 0
97
+ total = len(tests)
98
+
99
+ for test in tests:
100
+ if test():
101
+ passed += 1
102
+ print()
103
+
104
+ print(f"Test Results: {passed}/{total} tests passed")
105
+
106
+ if passed == total:
107
+ print("All tests passed! The cleaned up version is working correctly.")
108
+ return True
109
+ else:
110
+ print("Some tests failed. Please check the errors above.")
111
+ return False
112
+
113
+ if __name__ == "__main__":
114
+ success = main()
115
+ sys.exit(0 if success else 1)
utils/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ # Utils package for phoneme detection leaderboard
{src/utils → utils}/audio_process.py RENAMED
@@ -164,4 +164,4 @@ def calculate_error_rate(ref_seq, hyp_seq, unit="phoneme"):
164
  N = len(ref_seq) # reference length
165
  error_rate = (S + D + I) / N if N > 0 else 0.0
166
 
167
- return error_rate*100, {"S": S, "D": D, "I": I, "N": N}
 
164
  N = len(ref_seq) # reference length
165
  error_rate = (S + D + I) / N if N > 0 else 0.0
166
 
167
+ return error_rate*100, {"S": S, "D": D, "I": I, "N": N}
{src/utils → utils}/cmu_process.py RENAMED
@@ -108,4 +108,4 @@ def text_to_phoneme(text):
108
  phonemes = safe_g2p(clean_text(text))
109
  res = "".join(phonemes)
110
  res = clean_cmu(res)
111
- return res
 
108
  phonemes = safe_g2p(clean_text(text))
109
  res = "".join(phonemes)
110
  res = clean_cmu(res)
111
+ return res
{src/utils → utils}/load_model.py RENAMED
@@ -39,6 +39,7 @@ whisper_model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-
39
 
40
  # 3. My Hubert Model (optional HF token via env)
41
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
 
42
  proc = Wav2Vec2Processor.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKEN)
43
  model = HubertForCTC.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKEN).to(device).eval()
44
 
@@ -114,4 +115,4 @@ def run_timit(wav):
114
  phonemes = timit_proc.batch_decode(predicted_ids)
115
  phonemes = "".join(phonemes)
116
 
117
- return phonemes.strip(), time.time() - start
 
39
 
40
  # 3. My Hubert Model (optional HF token via env)
41
  HF_TOKEN = os.environ.get("HF_TOKEN", None)
42
+ # print(HF_TOKEN)
43
  proc = Wav2Vec2Processor.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKEN)
44
  model = HubertForCTC.from_pretrained("tecasoftai/hubert-finetune", token=HF_TOKEN).to(device).eval()
45
 
 
115
  phonemes = timit_proc.batch_decode(predicted_ids)
116
  phonemes = "".join(phonemes)
117
 
118
+ return phonemes.strip(), time.time() - start
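
Since `HF_TOKEN` is read from the environment at import time in `utils/load_model.py`, and `python-dotenv` was added to `requirements.txt`, a local `.env` file is presumably the intended way to supply the token. A minimal sketch, assuming a `.env` file containing `HF_TOKEN=<your token>` next to the app:

```python
# Minimal sketch, assuming a local .env file with HF_TOKEN=<your token>.
# load_dotenv() must run before utils.load_model is imported, because that
# module reads HF_TOKEN from os.environ at import time.
from dotenv import load_dotenv

load_dotenv()

from utils.load_model import run_hubert_base, run_model, run_timit  # noqa: E402
```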
utils_display.py ADDED
@@ -0,0 +1,48 @@
1
+ from dataclasses import dataclass
2
+
3
+ # These classes hold the user-facing column names, so a change only needs to
4
+ # be made in one place when a modification is needed
5
+ @dataclass
6
+ class ColumnContent:
7
+ name: str
8
+ type: str
9
+
10
+ def fields(raw_class):
11
+ return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
12
+
13
+ @dataclass(frozen=True)
14
+ class PhonemeEvalColumn: # Phoneme evals column
15
+ model = ColumnContent("Model", "markdown")
16
+ avg_per = ColumnContent("Average PER ⬇️", "number")
17
+ avg_duration = ColumnContent("Avg Duration (s)", "number")
18
+ per_phoneme_asr = ColumnContent("PER phoneme_asr", "number")
19
+ per_kids_phoneme_md = ColumnContent("PER kids_phoneme_md", "number")
20
+
21
+ def make_clickable_model(model_name):
22
+ model_name_list = model_name.split("/")
23
+ if model_name_list[0] == "local":
24
+ link = "#" # Local models don't have external links
25
+ elif model_name_list[0] == "facebook":
26
+ link = f"https://huggingface.co/{model_name}"
27
+ elif model_name_list[0] == "openai":
28
+ link = "https://openai.com/"
29
+ elif model_name_list[0] == "HuBERT-Base":
30
+ link = "https://huggingface.co/facebook/hubert-base-ls960"
31
+ elif model_name_list[0] == "HuBERT-fine-tuned":
32
+ link = "https://huggingface.co/tecasoftai/hubert-finetune"
33
+ elif model_name_list[0] == "Timit":
34
+ link = "https://huggingface.co/vitouphy/wav2vec2-xls-r-300m-timit-phoneme"
35
+ elif model_name_list[0] == "Whisper":
36
+ link = "https://huggingface.co/openai/whisper-base"
37
+ else:
38
+ link = f"https://huggingface.co/{model_name}"
39
+ return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
40
+
41
+ def styled_error(error):
42
+ return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
43
+
44
+ def styled_warning(warn):
45
+ return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
46
+
47
+ def styled_message(message):
48
+ return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"