import gradio as gr
import pandas as pd
import plotly.express as px
import os

DATA_DIR = "."
SUMMARY_FILE = os.path.join(DATA_DIR, "data/summary_data.csv")
DOMAIN_RANKS_FILE = os.path.join(DATA_DIR, "data/domain_ranks.csv")
COST_FILE = os.path.join(DATA_DIR, "data/cost_data.csv")
AVG_LATENCY_FILE = os.path.join(DATA_DIR, "data/avg_latency.csv")
P99_LATENCY_FILE = os.path.join(DATA_DIR, "data/p99_latency.csv")
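
# Sketch of the CSV layout this app assumes, inferred from the columns referenced below;
# the actual files may contain additional columns:
#   summary_data.csv : Model Name, AutoBench, Costs (USD), Avg Answer Duration (sec),
#                      P99 Answer Duration (sec), Chatbot Ar., AAI Index, MMLU Index
#   domain_ranks.csv : Model Name, plus one column per domain
#   cost_data.csv / avg_latency.csv / p99_latency.csv : model_name, plus one column per domain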


def load_data(filepath, separator=','):
    """Loads a CSV file, returning an empty DataFrame if it is missing or unreadable."""
    if not os.path.exists(filepath):
        print(f"Warning: Data file not found at {filepath}")
        return pd.DataFrame()
    try:
        df = pd.read_csv(filepath, sep=separator)
        # Drop index columns that pandas saves as "Unnamed: 0", etc.
        df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
        # Coerce columns that look numeric to numeric dtypes, leaving the model-name column alone.
        for col in df.columns:
            if col not in ('Model Name', 'model_name'):
                if df[col].astype(str).str.contains(r'^[0-9.,eE-]+$').any():
                    df[col] = pd.to_numeric(df[col], errors='coerce')
        return df
    except Exception as e:
        print(f"Error loading {filepath}: {e}")
        return pd.DataFrame()


print("Loading data...")
df_summary = load_data(SUMMARY_FILE)
df_domain = load_data(DOMAIN_RANKS_FILE)
df_cost = load_data(COST_FILE)
df_avg_latency = load_data(AVG_LATENCY_FILE)
df_p99_latency = load_data(P99_LATENCY_FILE)
print("Data loading complete.")

# Convert the per-response cost in the summary table from USD to US cents.
COST_COLUMN_SUMMARY = 'Costs (USD)'
NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)'

if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
    df_summary[COST_COLUMN_SUMMARY] = (pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100).round(3)
    df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)
    print(f"Converted '{COST_COLUMN_SUMMARY}' to $ Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
else:
    print(f"Warning: df_summary is empty or missing column '{COST_COLUMN_SUMMARY}'; skipping cost conversion.")

if not df_cost.empty:
    model_col_name = 'model_name'
    cost_cols = [col for col in df_cost.columns if col != model_col_name]
    for col in cost_cols:
        df_cost[col] = (pd.to_numeric(df_cost[col], errors='coerce') * 100).round(3)
    print("Converted cost breakdown columns to $ Cents in df_cost.")

# Prepare the display tables: rename columns, select what to show, and sort by AutoBench score.
try:
    df_summary = df_summary.rename(columns={'Model Name': 'Model'})

    summary_cols_display = ['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
    summary_cols_display = [col for col in summary_cols_display if col in df_summary.columns]
    df_summary_display = df_summary[summary_cols_display].copy()

    benchmark_cols = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index']
    benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns]
    df_benchmark_display = df_summary[benchmark_cols].copy()

    if 'AutoBench' in df_summary_display.columns:
        df_summary_display['AutoBench'] = pd.to_numeric(df_summary_display['AutoBench'], errors='coerce')
        df_summary_display.sort_values(by='AutoBench', ascending=False, inplace=True)
    else:
        print("Warning: 'AutoBench' column not found for sorting summary table.")

    if 'AutoBench' in df_benchmark_display.columns:
        df_benchmark_display['AutoBench'] = pd.to_numeric(df_benchmark_display['AutoBench'], errors='coerce')
        df_benchmark_display.sort_values(by='AutoBench', ascending=False, inplace=True)
    else:
        print("Warning: 'AutoBench' column not found for sorting benchmark table.")

except KeyError as e:
    print(f"Error preparing display columns: Missing key {e}. Check CSV headers and rename mapping.")
    df_summary_display = df_summary.copy()
    df_benchmark_display = pd.DataFrame()


# --- Build the Gradio app ---
with gr.Blocks(theme=gr.themes.Soft()) as app:
    gr.Markdown("# AutoBench LLM Leaderboard")
    gr.Markdown(
        "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
        "Includes performance, cost, and latency metrics.\n"
        "More info: [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)"
    )

    with gr.Tab("Overall Ranking"):
        gr.Markdown("## Overall Model Performance")
        gr.Markdown("Models ranked by AutoBench score. Lower cost ($ Cents) and latency (s) are better.")
        if not df_summary_display.empty:
            df_overall_rank_display = df_summary_display.copy()
            # Display the AutoBench score under the shorter header "Rank".
            if 'AutoBench' in df_overall_rank_display.columns:
                df_overall_rank_display.rename(columns={'AutoBench': 'Rank'}, inplace=True)
            gr.DataFrame(
                df_overall_rank_display,
                datatype=['str'] + ['number'] * (len(df_overall_rank_display.columns) - 1),
                interactive=True,
            )
        else:
            gr.Markdown("_(Summary data failed to load or is empty. Please check `summary_data.csv`)_")

    with gr.Tab("Benchmark Comparison"):
        gr.Markdown("## Benchmark Comparison")
        gr.Markdown("Comparison of AutoBench scores with other popular benchmarks (Chatbot Arena, Artificial Analysis Index, MMLU Index). Models sorted by AutoBench score.")
        if not df_benchmark_display.empty:
            gr.DataFrame(
                df_benchmark_display,
                datatype=['str'] + ['number'] * (len(df_benchmark_display.columns) - 1),
                interactive=True
            )
        else:
            gr.Markdown("_(Benchmark comparison data could not be prepared. Check `summary_data.csv` for 'Chatbot Ar.', 'AAI Index', 'MMLU Index' columns.)_")

    with gr.Tab("Performance Plots"):
        gr.Markdown("## Performance Visualizations")
        gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.")

        gr.Markdown("### Rank vs. Average Cost")
        if not df_summary.empty and 'AutoBench' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
            plot_df = df_summary.dropna(subset=['AutoBench', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
            plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
            plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY])

            if not plot_df.empty:
                fig_cost = px.scatter(
                    plot_df,
                    x=NEW_COST_COLUMN_SUMMARY,
                    y="AutoBench",
                    text="Model",
                    log_x=True,
                    title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
                    labels={'AutoBench': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost ($ Cents) - Log Scale'},
                    hover_data=['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)']
                )
                fig_cost.update_traces(textposition='top center')
                fig_cost.update_layout(
                    xaxis_title="Avg Cost ($ Cents) - Log Scale",
                    yaxis_title="AutoBench Rank",
                    width=1000,
                    height=800,
                    # Mirror the x-axis ticks along the top edge of the plot.
                    xaxis2=dict(
                        overlaying='x',
                        matches='x',
                        side='top',
                        showticklabels=True,
                        showline=True,
                        title=None
                    )
                )
                gr.Plot(fig_cost)
            else:
                gr.Markdown(f"_(Insufficient valid data for Rank vs Cost plot. Check 'AutoBench' and '{NEW_COST_COLUMN_SUMMARY}' columns in `summary_data.csv`)_")
        else:
            gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Cost plot)_")

        gr.Markdown("### Rank vs. Average Latency")
        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
            plot_df_avg_latency = df_summary.dropna(subset=['AutoBench', 'Avg Answer Duration (sec)', 'Model']).copy()
            plot_df_avg_latency['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df_avg_latency['Avg Answer Duration (sec)'], errors='coerce')
            plot_df_avg_latency = plot_df_avg_latency.dropna(subset=['Avg Answer Duration (sec)'])

            if not plot_df_avg_latency.empty:
                fig_avg_latency = px.scatter(
                    plot_df_avg_latency,
                    x="Avg Answer Duration (sec)",
                    y="AutoBench",
                    text="Model",
                    log_x=True,
                    title="AutoBench Rank vs. Average Latency (Log Scale)",
                    labels={'AutoBench': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
                    hover_data=['Model', 'AutoBench', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
                )
                fig_avg_latency.update_traces(textposition='top center')
                fig_avg_latency.update_layout(xaxis_title="Avg Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
                gr.Plot(fig_avg_latency)
            else:
                gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AutoBench' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
        else:
            gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Avg Latency plot)_")

        gr.Markdown("### Rank vs. P99 Latency")
        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
            plot_df_p99_latency = df_summary.dropna(subset=['AutoBench', 'P99 Answer Duration (sec)', 'Model']).copy()
            plot_df_p99_latency['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df_p99_latency['P99 Answer Duration (sec)'], errors='coerce')
            plot_df_p99_latency = plot_df_p99_latency.dropna(subset=['P99 Answer Duration (sec)'])

            if not plot_df_p99_latency.empty:
                fig_p99_latency = px.scatter(
                    plot_df_p99_latency,
                    x="P99 Answer Duration (sec)",
                    y="AutoBench",
                    text="Model",
                    log_x=True,
                    title="AutoBench Rank vs. P99 Latency (Log Scale)",
                    labels={'AutoBench': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
                    hover_data=['Model', 'AutoBench', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
                )
                fig_p99_latency.update_traces(textposition='top center')
                fig_p99_latency.update_layout(xaxis_title="P99 Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
                gr.Plot(fig_p99_latency)
            else:
                gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AutoBench' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
        else:
            gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs P99 Latency plot)_")

    with gr.Tab("Cost & Latency Analysis"):
        gr.Markdown("## Performance vs. Cost/Latency Trade-offs")

        gr.Markdown("### Cost Breakdown per Domain ($ Cents/Response)")
        if not df_cost.empty:
            if 'model_name' in df_cost.columns:
                cols = ['model_name'] + [col for col in df_cost.columns if col != 'model_name']
                df_cost_display = df_cost[cols]
            else:
                df_cost_display = df_cost
            gr.DataFrame(df_cost_display, interactive=True)
        else:
            gr.Markdown("_(Cost breakdown data failed to load or is empty. Please check `cost_data.csv`)_")

        gr.Markdown("### Average Latency Breakdown per Domain (Seconds)")
        if not df_avg_latency.empty:
            if 'model_name' in df_avg_latency.columns:
                cols = ['model_name'] + [col for col in df_avg_latency.columns if col != 'model_name']
                df_avg_latency_display = df_avg_latency[cols]
            else:
                df_avg_latency_display = df_avg_latency
            gr.DataFrame(df_avg_latency_display, interactive=True)
        else:
            gr.Markdown("_(Average latency data failed to load or is empty. Please check `avg_latency.csv`)_")

        gr.Markdown("### P99 Latency Breakdown per Domain (Seconds)")
        if not df_p99_latency.empty:
            if 'model_name' in df_p99_latency.columns:
                cols = ['model_name'] + [col for col in df_p99_latency.columns if col != 'model_name']
                df_p99_latency_display = df_p99_latency[cols]
            else:
                df_p99_latency_display = df_p99_latency
            gr.DataFrame(df_p99_latency_display, interactive=True)
        else:
            gr.Markdown("_(P99 latency data failed to load or is empty. Please check `p99_latency.csv`)_")

    with gr.Tab("Domain Performance"):
        gr.Markdown("## Performance Across Different Domains")
        gr.Markdown("Model ranks within specific knowledge or task areas. Higher is better.")
        if not df_domain.empty:
            if 'Model Name' in df_domain.columns:
                cols = ['Model Name'] + [col for col in df_domain.columns if col != 'Model Name']
                df_domain_display = df_domain[cols]
            else:
                df_domain_display = df_domain
            gr.DataFrame(df_domain_display, interactive=True)
        else:
            gr.Markdown("_(Domain ranks data failed to load or is empty. Please check `domain_ranks.csv`)_")

    with gr.Tab("About AutoBench"):
        gr.Markdown("""
## About AutoBench

AutoBench is an LLM benchmark where Large Language Models (LLMs) evaluate and rank the responses generated by other LLMs. The questions themselves are also generated by LLMs across a diverse set of domains and ranked for quality.

### Methodology
1. **Question Generation:** High-quality questions across various domains (Coding, History, Science, etc.) are generated by capable LLMs.
2. **Response Generation:** The models being benchmarked generate answers to these questions.
3. **Ranking:** A high-capability LLM (e.g., GPT-4, Claude 3) ranks the responses from different models for each question, typically on a scale (e.g., 1-5).
4. **Aggregation:** Scores are averaged across multiple questions and domains to produce the final AutoBench rank.
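
   *For example (illustrative numbers only): a model whose responses receive average ranks of 4.2, 3.9, and 4.5 across three domains would end up with an overall AutoBench score of 4.2.*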

### Metrics
* **AutoBench Score (AB):** The average rank received by a model's responses across all questions and domains (higher is better).
* **Avg Cost (USD Cents/response):** Estimated average cost to generate one response, based on model provider pricing (input + output tokens). Lower is better.
* **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better.
* **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better.
* **Chatbot Arena / Artificial Analysis Intelligence Index / MMLU:** Scores from other well-known benchmarks for comparison (where available).

### Data
This leaderboard reflects a run completed on April 23, 2025. The run included recently released models such as o4-mini, GPT-4.1-mini, Gemini 2.5 Pro Preview, and Claude 3.7 Sonnet (thinking).

### Links
* [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)
* [Leaderboard Source Code](https://huggingface.co/spaces/<your-username>/<your-space-name>/tree/main)

**Disclaimer:** Benchmark results provide one perspective on model capabilities. Performance can vary based on specific tasks, prompts, and API conditions. Costs are estimates and subject to change by providers. Latency depends on server load and geographic location.
        """)


print("Launching Gradio app...")
app.launch()
print("Gradio app launched.")