Commit e4f522a
Parent(s): 61cd432

optimization of the leaderboard

Files changed:
- app.py (+74 -42)
- data/summary_data.csv (+6 -6)
app.py CHANGED

@@ -42,15 +42,15 @@ df_avg_latency = load_data(AVG_LATENCY_FILE)
 df_p99_latency = load_data(P99_LATENCY_FILE)
 print("Data loading complete.")

-# --- *** NEW: Convert Costs to Cents *** ---
+# --- *** NEW: Convert Costs to USD Cents *** ---
 COST_COLUMN_SUMMARY = 'Costs (USD)' # IMPORTANT: Check this matches your summary_data.csv header EXACTLY
-NEW_COST_COLUMN_SUMMARY = 'Avg Cost (Cents)' # This is the new name we'll use
+NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)' # This is the new name we'll use

 # Convert summary cost
 if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
-    df_summary[COST_COLUMN_SUMMARY] = pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100
+    df_summary[COST_COLUMN_SUMMARY] = (pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
     df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)
-    print(f"Converted '{COST_COLUMN_SUMMARY}' to Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
+    print(f"Converted '{COST_COLUMN_SUMMARY}' to $ Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
 else:
     print(f"Warning: Column '{COST_COLUMN_SUMMARY}' not found in df_summary for conversion.")
@@ -61,8 +61,8 @@ if not df_cost.empty:
     cost_cols = [col for col in df_cost.columns if col != model_col_name]
     for col in cost_cols:
         # Handle potential non-numeric data gracefully before multiplying
-        df_cost[col] = pd.to_numeric(df_cost[col], errors='coerce') * 100
-    print("Converted cost breakdown columns to Cents in df_cost.")
+        df_cost[col] = (pd.to_numeric(df_cost[col], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
+    print("Converted cost breakdown columns to $ Cents in df_cost.")
 # --- *** End of Cost Conversion *** ---

 # Rename columns for clarity if needed (example for summary)
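A note on the conversion pattern used in both hunks above: pd.to_numeric(..., errors='coerce') turns unparseable cells into NaN instead of raising, and the newly added .round(3) keeps the cents values tidy for display. A minimal, self-contained sketch on made-up data (column names follow the commit; the values are illustrative):

import pandas as pd

df = pd.DataFrame({'Model': ['a', 'b'],
                   'Costs (USD)': ['0.00182703', 'n/a']})
# Coerce to numeric (bad cells become NaN), convert USD to cents, round.
df['Costs (USD)'] = (pd.to_numeric(df['Costs (USD)'], errors='coerce') * 100).round(3)
df.rename(columns={'Costs (USD)': 'Avg Cost ($ Cents)'}, inplace=True)
print(df)  # row 'a' -> 0.183 cents, row 'b' -> NaN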
@@ -75,22 +75,34 @@ try:
         # 'Avg Answer Duration (sec)': 'Avg Latency (s)',
         # 'P99 Answer Duration (sec)': 'P99 Latency (s)'
     })
-    # Select and reorder columns for the main table
-    summary_cols_display = ['Model', '
+    # Select and reorder columns for the main table - REMOVED BENCHMARK COLUMNS
+    summary_cols_display = ['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
     # Filter to only columns that actually exist after loading and renaming
     summary_cols_display = [col for col in summary_cols_display if col in df_summary.columns]
-    df_summary_display = df_summary[summary_cols_display]
+    df_summary_display = df_summary[summary_cols_display].copy() # Use .copy() to avoid SettingWithCopyWarning

-    #
-
-
-
+    # Select columns for the new benchmark comparison table
+    benchmark_cols = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index']
+    benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns] # Filter existing
+    df_benchmark_display = df_summary[benchmark_cols].copy() # Use .copy()
+
+    # Ensure AutoBench score is numeric for sorting BOTH display tables
+    if 'AutoBench' in df_summary_display.columns:
+        df_summary_display['AutoBench'] = pd.to_numeric(df_summary_display['AutoBench'], errors='coerce')
+        df_summary_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
+    else:
+        print("Warning: 'AutoBench' column not found for sorting summary table.")
+
+    if 'AutoBench' in df_benchmark_display.columns:
+        df_benchmark_display['AutoBench'] = pd.to_numeric(df_benchmark_display['AutoBench'], errors='coerce')
+        df_benchmark_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
     else:
-        print("Warning: '
+        print("Warning: 'AutoBench' column not found for sorting benchmark table.")

 except KeyError as e:
-    print(f"Error preparing
-    df_summary_display = df_summary # Fallback
+    print(f"Error preparing display columns: Missing key {e}. Check CSV headers and rename mapping.")
+    df_summary_display = df_summary.copy() # Fallback
+    df_benchmark_display = pd.DataFrame() # Fallback to empty for benchmark table


 # --- Build Gradio App ---
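The .copy() calls added in this hunk matter: slicing a DataFrame can return a view, and coercing or sorting a view in place triggers pandas' SettingWithCopyWarning (or silently fails to stick). A small sketch of the two-table pattern, with toy values standing in for the real summary data:

import pandas as pd

df_summary = pd.DataFrame({'Model': ['m1', 'm2'],
                           'AutoBench': ['4.2', '4.39'],
                           'Chatbot Ar.': [1293, 1303]})
# Independent copies, so each display table can be mutated and sorted safely.
df_summary_display = df_summary[['Model', 'AutoBench']].copy()
df_benchmark_display = df_summary[['Model', 'AutoBench', 'Chatbot Ar.']].copy()
for table in (df_summary_display, df_benchmark_display):
    table['AutoBench'] = pd.to_numeric(table['AutoBench'], errors='coerce')
    table.sort_values(by='AutoBench', ascending=False, inplace=True)
print(df_summary_display)  # m2 (4.39) now sorts above m1 (4.2)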
@@ -98,35 +110,55 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# AutoBench LLM Leaderboard")
     gr.Markdown(
         "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
-        "Includes performance, cost, and latency metrics
+        "Includes performance, cost, and latency metrics.\n"
         "More info: [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)"
     )

     # --- Tab 1: Overall Ranking ---
     with gr.Tab("Overall Ranking"):
         gr.Markdown("## Overall Model Performance")
-
+        # REMOVED benchmark correlations from Markdown
+        gr.Markdown("Models ranked by AutoBench score. Lower cost ($ Cents) and latency (s) are better.")
         # Check if df_summary_display has data before rendering
         if not df_summary_display.empty:
+            # Create a copy specifically for this tab's display and rename the column
+            df_overall_rank_display = df_summary_display.copy()
+            if 'AutoBench' in df_overall_rank_display.columns:
+                df_overall_rank_display.rename(columns={'AutoBench': 'Rank'}, inplace=True)
+
             gr.DataFrame(
-
-
+                df_overall_rank_display, # Pass the renamed DF
+                # Adjust datatype length based on potentially fewer columns
+                datatype=['str'] + ['number'] * (len(df_overall_rank_display.columns) - 1),
                 interactive=True, # Allows sorting
                 # height=600 # Adjust height as needed
             )
         else:
             gr.Markdown("_(Summary data failed to load or is empty. Please check `summary_data.csv`)_")

-    # --- Tab
+    # --- NEW Tab 1.5: Benchmark Comparison ---
+    with gr.Tab("Benchmark Comparison"):
+        gr.Markdown("## Benchmark Comparison")
+        gr.Markdown("Comparison of AutoBench scores with other popular benchmarks (Chatbot Arena, Artificial Analysis Index, MMLU Index). Models sorted by AutoBench score.")
+        if not df_benchmark_display.empty:
+            gr.DataFrame(
+                df_benchmark_display,
+                datatype=['str'] + ['number'] * (len(df_benchmark_display.columns) - 1),
+                interactive=True # Allow sorting
+            )
+        else:
+            gr.Markdown("_(Benchmark comparison data could not be prepared. Check `summary_data.csv` for 'Chatbot Ar.', 'AAI Index', 'MMLU Index' columns.)_")
+
+    # --- Tab 2: Performance Plots ---
     with gr.Tab("Performance Plots"):
         gr.Markdown("## Performance Visualizations")
         gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.")

         # Scatter Plot 1 (using summary data)
         gr.Markdown("### Rank vs. Average Cost")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df = df_summary.dropna(subset=['
+            plot_df = df_summary.dropna(subset=['AutoBench', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
             plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
             plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY]) # Drop if cost conversion failed
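For readers who don't know Gradio, a stripped-down sketch of the tab-plus-sortable-table structure this hunk builds. It assumes only gradio and pandas; the toy DataFrame stands in for df_overall_rank_display:

import gradio as gr
import pandas as pd

df = pd.DataFrame({'Model': ['m1', 'm2'], 'Rank': [4.39, 4.2]})

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Tab("Overall Ranking"):
        gr.DataFrame(
            df,
            # first column is text, the rest numeric
            datatype=['str'] + ['number'] * (len(df.columns) - 1),
            interactive=True,  # lets users sort by clicking column headers
        )

if __name__ == "__main__":
    demo.launch()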
@@ -134,12 +166,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             fig_cost = px.scatter(
                 plot_df,
                 x=NEW_COST_COLUMN_SUMMARY,
-                y="
+                y="AutoBench",
                 text="Model", # Show model name near point
                 log_x=True, # Use log scale for cost
-                title="AutoBench Rank vs. Average Cost per Response (
-                labels={'
-                hover_data=['Model', '
+                title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
+                labels={'AutoBench': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost ($ Cents) - Log Scale'},
+                hover_data=['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)'] # Show details on hover
             )
             fig_cost.update_traces(textposition='top center')
             fig_cost.update_layout(
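The same px.scatter recipe is reused for all three plots: a log-scale x axis (cost and latency span orders of magnitude), model names as point labels, and extra columns surfaced on hover. A runnable sketch with invented values:

import pandas as pd
import plotly.express as px

plot_df = pd.DataFrame({'Model': ['m1', 'm2'],
                        'AutoBench': [4.2, 4.39],
                        'Avg Cost ($ Cents)': [0.183, 4.32]})
fig = px.scatter(
    plot_df,
    x='Avg Cost ($ Cents)',
    y='AutoBench',
    text='Model',   # label each point with the model name
    log_x=True,     # compress the long cost tail
    hover_data=['Model', 'AutoBench'],
)
fig.update_traces(textposition='top center')
fig.show()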
@@ -160,15 +192,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             )
             gr.Plot(fig_cost)
         else:
-            gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check '
+            gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check 'AutoBench' and NEW_COST_COLUMN_SUMMARY columns in `summary_data.csv`)_")
     else:
         gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Cost plot)_")

         # Plot 2: Rank vs Average Latency
         gr.Markdown("### Rank vs. Average Latency")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_avg_latency = df_summary.dropna(subset=['
+            plot_df_avg_latency = df_summary.dropna(subset=['AutoBench', 'Avg Answer Duration (sec)', 'Model']).copy()
             plot_df_avg_latency['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df_avg_latency['Avg Answer Duration (sec)'], errors='coerce')
             plot_df_avg_latency = plot_df_avg_latency.dropna(subset=['Avg Answer Duration (sec)']) # Drop if conversion failed
@@ -176,27 +208,27 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             fig_avg_latency = px.scatter(
                 plot_df_avg_latency,
                 x="Avg Answer Duration (sec)",
-                y="
+                y="AutoBench",
                 text="Model",
                 log_x=True, # Use log scale for latency - adjust if not desired
                 title="AutoBench Rank vs. Average Latency (Log Scale)",
-                labels={'
-                hover_data=['Model', '
+                labels={'AutoBench': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
+                hover_data=['Model', 'AutoBench', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
             )
             fig_avg_latency.update_traces(textposition='top center')
             fig_avg_latency.update_layout(xaxis_title="Avg Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
             gr.Plot(fig_avg_latency)
         else:
-            gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check '
+            gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AutoBench' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
     else:
         gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Avg Latency plot)_")


         # Plot 3: Rank vs P99 Latency
         gr.Markdown("### Rank vs. P99 Latency")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_p99_latency = df_summary.dropna(subset=['
+            plot_df_p99_latency = df_summary.dropna(subset=['AutoBench', 'P99 Answer Duration (sec)', 'Model']).copy()
             plot_df_p99_latency['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df_p99_latency['P99 Answer Duration (sec)'], errors='coerce')
             plot_df_p99_latency = plot_df_p99_latency.dropna(subset=['P99 Answer Duration (sec)']) # Drop if conversion failed
@@ -204,18 +236,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             fig_p99_latency = px.scatter(
                 plot_df_p99_latency,
                 x="P99 Answer Duration (sec)",
-                y="
+                y="AutoBench",
                 text="Model",
                 log_x=True, # Use log scale for latency - adjust if not desired
                 title="AutoBench Rank vs. P99 Latency (Log Scale)",
-                labels={'
-                hover_data=['Model', '
+                labels={'AutoBench': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
+                hover_data=['Model', 'AutoBench', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
             )
             fig_p99_latency.update_traces(textposition='top center')
             fig_p99_latency.update_layout(xaxis_title="P99 Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
             gr.Plot(fig_p99_latency)
         else:
-            gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check '
+            gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AutoBench' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
     else:
         gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs P99 Latency plot)_")
@@ -224,7 +256,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("## Performance vs. Cost/Latency Trade-offs")

     # Cost Breakdown Table
-    gr.Markdown("### Cost Breakdown per Domain (
+    gr.Markdown("### Cost Breakdown per Domain ($ Cents/Response)") # <-- MODIFIED
     if not df_cost.empty:
         # Make model name the first column if it exists
         if 'model_name' in df_cost.columns:
@@ -293,7 +325,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     * **Avg Cost (USD Cents/response):** Estimated average cost to generate one response based on model provider pricing (input+output tokens). Lower is better.
     * **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better.
     * **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better.
-    * **
+    * **Chatbot Arena / Artificial Analysis Intelligence Index / MMLU:** Scores from other well-known benchmarks for comparison (where available).

     ### Data
     This leaderboard reflects a run completed on April 23, 2025, which included recently released models such as o4-mini, GPT-4.1-mini, Gemini 2.5 Pro Preview, Claude 3.7 Sonnet:thinking, etc.
data/summary_data.csv CHANGED

@@ -1,7 +1,7 @@
-Model,
-claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.
+Model,AutoBench,Chatbot Ar.,AAI Index,MMLU Index,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec)
+claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.8,17.98
 claude-3.7-sonnet,4.2,1293,48150,0.803,0.01133934,15.53,32.86
-claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.
+claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.8,82.6
 deepSeek-R1,4.26,1358,60220,0.844,0.00515901,84.77,223.47
 deepSeek-V3,4.09,1318,45580,0.752,0.00094273,34.57,106.53
 deepSeek-V3-0324,4.16,1372,53240,0.819,0.00102168,42.28,140.54

@@ -13,7 +13,7 @@ gpt-4o-mini,4,1272,35680,0.648,0.00038653,12.17,21.75
 grok-2-1212,4.1,1288,39230,0.709,0.00847157,11.74,23.32
 grok-3-beta,4.34,1402,50630,0.799,0.01694996,33.94,69.79
 llama-3.1-Nemotron-70B-Instruct-HF,4.18,1269,37280,,0.00038647,25.04,48.74
-llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.
+llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.7
 llama-3_1-Nemotron-Ultra-253B-v1,4.26,,,0.69,0.0031635,43.84,94.45
 llama-4-Maverick-17B-128E-Instruct-FP8,4,1271,50530,0.809,0.00067195,9.76,23.11
 llama-4-Scout-17B-16E-Instruct,4,,42990,0.752,0.000477,8.49,13.82

@@ -22,5 +22,5 @@ mistral-small-24b-instruct-2501,3.88,1217,35280,0.652,0.00012061,13.99,29.62
 nova-lite-v1,3.89,1217,32530,0.59,0.00015889,5.22,12.47
 nova-pro-v1,3.83,1245,37080,0.691,0.0013758,5.65,9.93
 o3-mini-2025-01-31,4.26,1305,62860,0.791,0.00612595,10.69,23.67
-o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.
-qwen-plus,4.17,1310,,,0.00094732,34.73,66.
+o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.1,52.3
+qwen-plus,4.17,1310,,,0.00094732,34.73,66.7
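A quick way to sanity-check the new schema after pulling this commit, assuming pandas (blank benchmark cells load as NaN):

import pandas as pd

df = pd.read_csv('data/summary_data.csv')
expected = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index',
            'Costs (USD)', 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
assert list(df.columns) == expected, f"unexpected header: {list(df.columns)}"
print(df.sort_values('AutoBench', ascending=False)[['Model', 'AutoBench']].head())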