Commit e4f522a
Parent(s): 61cd432

optimization of the leaderboard

Files changed:
- app.py (+74 -42)
- data/summary_data.csv (+6 -6)
app.py CHANGED

@@ -42,15 +42,15 @@ df_avg_latency = load_data(AVG_LATENCY_FILE)
 df_p99_latency = load_data(P99_LATENCY_FILE)
 print("Data loading complete.")

-# --- *** NEW: Convert Costs to Cents *** ---
+# --- *** NEW: Convert Costs to USD Cents *** ---
 COST_COLUMN_SUMMARY = 'Costs (USD)' # IMPORTANT: Check this matches your summary_data.csv header EXACTLY
-NEW_COST_COLUMN_SUMMARY = 'Avg Cost (Cents)' # This is the new name we'll use
+NEW_COST_COLUMN_SUMMARY = 'Avg Cost ($ Cents)' # This is the new name we'll use

 # Convert summary cost
 if not df_summary.empty and COST_COLUMN_SUMMARY in df_summary.columns:
-    df_summary[COST_COLUMN_SUMMARY] = pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100
+    df_summary[COST_COLUMN_SUMMARY] = (pd.to_numeric(df_summary[COST_COLUMN_SUMMARY], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
     df_summary.rename(columns={COST_COLUMN_SUMMARY: NEW_COST_COLUMN_SUMMARY}, inplace=True)
-    print(f"Converted '{COST_COLUMN_SUMMARY}' to Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
+    print(f"Converted '{COST_COLUMN_SUMMARY}' to $ Cents and renamed to '{NEW_COST_COLUMN_SUMMARY}' in df_summary.")
 else:
     print(f"Warning: Column '{COST_COLUMN_SUMMARY}' not found in df_summary for conversion.")
@@ -61,8 +61,8 @@ if not df_cost.empty:
     cost_cols = [col for col in df_cost.columns if col != model_col_name]
     for col in cost_cols:
         # Handle potential non-numeric data gracefully before multiplying
-        df_cost[col] = pd.to_numeric(df_cost[col], errors='coerce') * 100
-    print("Converted cost breakdown columns to Cents in df_cost.")
+        df_cost[col] = (pd.to_numeric(df_cost[col], errors='coerce') * 100).round(3) # <-- ADDED .round(3)
+    print("Converted cost breakdown columns to $ Cents in df_cost.")
 # --- *** End of Cost Conversion *** ---

 # Rename columns for clarity if needed (example for summary)
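A note on the conversion pattern used in both hunks above: pd.to_numeric(..., errors='coerce') turns unparseable cells into NaN instead of raising, and the newly added .round(3) keeps the cents values tidy for display. A minimal, self-contained sketch on made-up data (column names follow the commit; the values are illustrative):

import pandas as pd

df = pd.DataFrame({'Model': ['a', 'b'],
                   'Costs (USD)': ['0.00182703', 'n/a']})
# Coerce to numeric (bad cells become NaN), convert USD to cents, round.
df['Costs (USD)'] = (pd.to_numeric(df['Costs (USD)'], errors='coerce') * 100).round(3)
df.rename(columns={'Costs (USD)': 'Avg Cost ($ Cents)'}, inplace=True)
print(df)  # row 'a' -> 0.183 cents, row 'b' -> NaN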
@@ -75,22 +75,34 @@ try:
         # 'Avg Answer Duration (sec)': 'Avg Latency (s)',
         # 'P99 Answer Duration (sec)': 'P99 Latency (s)'
     })
-    # Select and reorder columns for the main table
-    summary_cols_display = ['Model', '
+    # Select and reorder columns for the main table - REMOVED BENCHMARK COLUMNS
+    summary_cols_display = ['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
     # Filter to only columns that actually exist after loading and renaming
     summary_cols_display = [col for col in summary_cols_display if col in df_summary.columns]
-    df_summary_display = df_summary[summary_cols_display]
+    df_summary_display = df_summary[summary_cols_display].copy() # Use .copy() to avoid SettingWithCopyWarning

-    #
-
-
-
+    # Select columns for the new benchmark comparison table
+    benchmark_cols = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index']
+    benchmark_cols = [col for col in benchmark_cols if col in df_summary.columns] # Filter existing
+    df_benchmark_display = df_summary[benchmark_cols].copy() # Use .copy()
+
+    # Ensure AutoBench score is numeric for sorting BOTH display tables
+    if 'AutoBench' in df_summary_display.columns:
+        df_summary_display['AutoBench'] = pd.to_numeric(df_summary_display['AutoBench'], errors='coerce')
+        df_summary_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
+    else:
+        print("Warning: 'AutoBench' column not found for sorting summary table.")
+
+    if 'AutoBench' in df_benchmark_display.columns:
+        df_benchmark_display['AutoBench'] = pd.to_numeric(df_benchmark_display['AutoBench'], errors='coerce')
+        df_benchmark_display.sort_values(by='AutoBench', ascending=False, inplace=True) # Use inplace=True
     else:
-        print("Warning: '
+        print("Warning: 'AutoBench' column not found for sorting benchmark table.")

 except KeyError as e:
-    print(f"Error preparing
-    df_summary_display = df_summary # Fallback
+    print(f"Error preparing display columns: Missing key {e}. Check CSV headers and rename mapping.")
+    df_summary_display = df_summary.copy() # Fallback
+    df_benchmark_display = pd.DataFrame() # Fallback to empty for benchmark table


 # --- Build Gradio App ---
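The .copy() calls added in this hunk matter: slicing a DataFrame can return a view, and coercing or sorting a view in place triggers pandas' SettingWithCopyWarning (or silently fails to stick). A small sketch of the two-table pattern, with toy values standing in for the real summary data:

import pandas as pd

df_summary = pd.DataFrame({'Model': ['m1', 'm2'],
                           'AutoBench': ['4.2', '4.39'],
                           'Chatbot Ar.': [1293, 1303]})
# Independent copies, so each display table can be mutated and sorted safely.
df_summary_display = df_summary[['Model', 'AutoBench']].copy()
df_benchmark_display = df_summary[['Model', 'AutoBench', 'Chatbot Ar.']].copy()
for table in (df_summary_display, df_benchmark_display):
    table['AutoBench'] = pd.to_numeric(table['AutoBench'], errors='coerce')
    table.sort_values(by='AutoBench', ascending=False, inplace=True)
print(df_summary_display)  # m2 (4.39) now sorts above m1 (4.2)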
@@ -98,35 +110,55 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("# AutoBench LLM Leaderboard")
     gr.Markdown(
         "Interactive leaderboard for AutoBench, where LLMs rank LLMs' responses. "
-        "Includes performance, cost, and latency metrics
+        "Includes performance, cost, and latency metrics.\n"
         "More info: [AutoBench Blog Post](https://huggingface.co/blog/PeterKruger/autobench)"
     )

     # --- Tab 1: Overall Ranking ---
     with gr.Tab("Overall Ranking"):
         gr.Markdown("## Overall Model Performance")
-
+        # REMOVED benchmark correlations from Markdown
+        gr.Markdown("Models ranked by AutoBench score. Lower cost ($ Cents) and latency (s) are better.")
         # Check if df_summary_display has data before rendering
         if not df_summary_display.empty:
+            # Create a copy specifically for this tab's display and rename the column
+            df_overall_rank_display = df_summary_display.copy()
+            if 'AutoBench' in df_overall_rank_display.columns:
+                df_overall_rank_display.rename(columns={'AutoBench': 'Rank'}, inplace=True)
+
             gr.DataFrame(
-
-
+                df_overall_rank_display, # Pass the renamed DF
+                # Adjust datatype length based on potentially fewer columns
+                datatype=['str'] + ['number'] * (len(df_overall_rank_display.columns) - 1),
                 interactive=True, # Allows sorting
                 # height=600 # Adjust height as needed
             )
         else:
             gr.Markdown("_(Summary data failed to load or is empty. Please check `summary_data.csv`)_")

-    # --- Tab
+    # --- NEW Tab 1.5: Benchmark Comparison ---
+    with gr.Tab("Benchmark Comparison"):
+        gr.Markdown("## Benchmark Comparison")
+        gr.Markdown("Comparison of AutoBench scores with other popular benchmarks (Chatbot Arena, Artificial Analysis Index, MMLU Index). Models sorted by AutoBench score.")
+        if not df_benchmark_display.empty:
+            gr.DataFrame(
+                df_benchmark_display,
+                datatype=['str'] + ['number'] * (len(df_benchmark_display.columns) - 1),
+                interactive=True # Allow sorting
+            )
+        else:
+            gr.Markdown("_(Benchmark comparison data could not be prepared. Check `summary_data.csv` for 'Chatbot Ar.', 'AAI Index', 'MMLU Index' columns.)_")
+
+    # --- Tab 2: Performance Plots ---
     with gr.Tab("Performance Plots"):
         gr.Markdown("## Performance Visualizations")
         gr.Markdown("Exploring relationships between AutoBench Rank, Latency, and Cost.")

         # Scatter Plot 1 (using summary data)
         gr.Markdown("### Rank vs. Average Cost")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and NEW_COST_COLUMN_SUMMARY in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df = df_summary.dropna(subset=['
+            plot_df = df_summary.dropna(subset=['AutoBench', NEW_COST_COLUMN_SUMMARY, 'Model']).copy()
             plot_df[NEW_COST_COLUMN_SUMMARY] = pd.to_numeric(plot_df[NEW_COST_COLUMN_SUMMARY], errors='coerce')
             plot_df = plot_df.dropna(subset=[NEW_COST_COLUMN_SUMMARY]) # Drop if cost conversion failed
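For readers who don't know Gradio, a stripped-down sketch of the tab-plus-sortable-table structure this hunk builds. It assumes only gradio and pandas; the toy DataFrame stands in for df_overall_rank_display:

import gradio as gr
import pandas as pd

df = pd.DataFrame({'Model': ['m1', 'm2'], 'Rank': [4.39, 4.2]})

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Tab("Overall Ranking"):
        gr.DataFrame(
            df,
            # first column is text, the rest numeric
            datatype=['str'] + ['number'] * (len(df.columns) - 1),
            interactive=True,  # lets users sort by clicking column headers
        )

if __name__ == "__main__":
    demo.launch()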
@@ -134,12 +166,12 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             fig_cost = px.scatter(
                 plot_df,
                 x=NEW_COST_COLUMN_SUMMARY,
-                y="
+                y="AutoBench",
                 text="Model", # Show model name near point
                 log_x=True, # Use log scale for cost
-                title="AutoBench Rank vs. Average Cost per Response (
-                labels={'
-                hover_data=['Model', '
+                title="AutoBench Rank vs. Average Cost per Response ($ Cents - Log Scale)",
+                labels={'AutoBench': 'AutoBench Rank', NEW_COST_COLUMN_SUMMARY: 'Avg Cost ($ Cents) - Log Scale'},
+                hover_data=['Model', 'AutoBench', NEW_COST_COLUMN_SUMMARY, 'Avg Answer Duration (sec)'] # Show details on hover
             )
             fig_cost.update_traces(textposition='top center')
             fig_cost.update_layout(
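The same px.scatter recipe is reused for all three plots: a log-scale x axis (cost and latency span orders of magnitude), model names as point labels, and extra columns surfaced on hover. A runnable sketch with invented values:

import pandas as pd
import plotly.express as px

plot_df = pd.DataFrame({'Model': ['m1', 'm2'],
                        'AutoBench': [4.2, 4.39],
                        'Avg Cost ($ Cents)': [0.183, 4.32]})
fig = px.scatter(
    plot_df,
    x='Avg Cost ($ Cents)',
    y='AutoBench',
    text='Model',   # label each point with the model name
    log_x=True,     # compress the long cost tail
    hover_data=['Model', 'AutoBench'],
)
fig.update_traces(textposition='top center')
fig.show()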
@@ -160,15 +192,15 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             )
             gr.Plot(fig_cost)
         else:
-            gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check '
+            gr.Markdown("_(Insufficient valid data for Rank vs Cost plot. Check 'AutoBench' and NEW_COST_COLUMN_SUMMARY columns in `summary_data.csv`)_")
     else:
         gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Cost plot)_")

         # Plot 2: Rank vs Average Latency
         gr.Markdown("### Rank vs. Average Latency")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'Avg Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_avg_latency = df_summary.dropna(subset=['
+            plot_df_avg_latency = df_summary.dropna(subset=['AutoBench', 'Avg Answer Duration (sec)', 'Model']).copy()
             plot_df_avg_latency['Avg Answer Duration (sec)'] = pd.to_numeric(plot_df_avg_latency['Avg Answer Duration (sec)'], errors='coerce')
             plot_df_avg_latency = plot_df_avg_latency.dropna(subset=['Avg Answer Duration (sec)']) # Drop if conversion failed
@@ -176,27 +208,27 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             fig_avg_latency = px.scatter(
                 plot_df_avg_latency,
                 x="Avg Answer Duration (sec)",
-                y="
+                y="AutoBench",
                 text="Model",
                 log_x=True, # Use log scale for latency - adjust if not desired
                 title="AutoBench Rank vs. Average Latency (Log Scale)",
-                labels={'
-                hover_data=['Model', '
+                labels={'AutoBench': 'AutoBench Rank', 'Avg Answer Duration (sec)': 'Avg Latency (s) - Log Scale'},
+                hover_data=['Model', 'AutoBench', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
             )
             fig_avg_latency.update_traces(textposition='top center')
             fig_avg_latency.update_layout(xaxis_title="Avg Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
             gr.Plot(fig_avg_latency)
         else:
-            gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check '
+            gr.Markdown("_(Insufficient valid data for Rank vs Avg Latency plot. Check 'AutoBench' and 'Avg Answer Duration (sec)' columns in `summary_data.csv`)_")
     else:
         gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs Avg Latency plot)_")


         # Plot 3: Rank vs P99 Latency
         gr.Markdown("### Rank vs. P99 Latency")
-        if not df_summary.empty and '
+        if not df_summary.empty and 'AutoBench' in df_summary.columns and 'P99 Answer Duration (sec)' in df_summary.columns:
             # Filter out rows where essential plot data might be missing
-            plot_df_p99_latency = df_summary.dropna(subset=['
+            plot_df_p99_latency = df_summary.dropna(subset=['AutoBench', 'P99 Answer Duration (sec)', 'Model']).copy()
             plot_df_p99_latency['P99 Answer Duration (sec)'] = pd.to_numeric(plot_df_p99_latency['P99 Answer Duration (sec)'], errors='coerce')
             plot_df_p99_latency = plot_df_p99_latency.dropna(subset=['P99 Answer Duration (sec)']) # Drop if conversion failed
@@ -204,18 +236,18 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
             fig_p99_latency = px.scatter(
                 plot_df_p99_latency,
                 x="P99 Answer Duration (sec)",
-                y="
+                y="AutoBench",
                 text="Model",
                 log_x=True, # Use log scale for latency - adjust if not desired
                 title="AutoBench Rank vs. P99 Latency (Log Scale)",
-                labels={'
-                hover_data=['Model', '
+                labels={'AutoBench': 'AutoBench Rank', 'P99 Answer Duration (sec)': 'P99 Latency (s) - Log Scale'},
+                hover_data=['Model', 'AutoBench', 'P99 Answer Duration (sec)', 'Avg Answer Duration (sec)', NEW_COST_COLUMN_SUMMARY]
             )
             fig_p99_latency.update_traces(textposition='top center')
             fig_p99_latency.update_layout(xaxis_title="P99 Latency (s) - Log Scale", yaxis_title="AutoBench Rank", width=1000, height=800)
             gr.Plot(fig_p99_latency)
         else:
-            gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check '
+            gr.Markdown("_(Insufficient valid data for Rank vs P99 Latency plot. Check 'AutoBench' and 'P99 Answer Duration (sec)' columns in `summary_data.csv`)_")
     else:
         gr.Markdown("_(Summary data failed to load or essential columns missing for Rank vs P99 Latency plot)_")
@@ -224,7 +256,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     gr.Markdown("## Performance vs. Cost/Latency Trade-offs")

     # Cost Breakdown Table
-    gr.Markdown("### Cost Breakdown per Domain (
+    gr.Markdown("### Cost Breakdown per Domain ($ Cents/Response)") # <-- MODIFIED
     if not df_cost.empty:
         # Make model name the first column if it exists
         if 'model_name' in df_cost.columns:
@@ -293,7 +325,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as app:
     * **Avg Cost (USD Cents/response):** Estimated average cost to generate one response based on model provider pricing (input+output tokens). Lower is better.
     * **Avg Latency (s):** Average time taken by the model to generate a response. Lower is better.
     * **P99 Latency (s):** The 99th percentile of response time, indicating worst-case latency. Lower is better.
-    * **
+    * **Chatbot Arena / Artificial Analysis Intelligence Index / MMLU:** Scores from other well-known benchmarks for comparison (where available).

     ### Data
     This leaderboard reflects a run completed on April 23, 2025, which included recently released models such as o4-mini, GPT-4.1-mini, Gemini 2.5 Pro Preview, Claude 3.7 Sonnet:thinking, etc.
data/summary_data.csv CHANGED

@@ -1,7 +1,7 @@
-Model,
-claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.
+Model,AutoBench,Chatbot Ar.,AAI Index,MMLU Index,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec)
+claude-3.5-haiku-20241022,3.99,1237,34740,0.634,0.00182703,10.8,17.98
 claude-3.7-sonnet,4.2,1293,48150,0.803,0.01133934,15.53,32.86
-claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.
+claude-3.7-sonnet:thinking,4.39,1303,57390,0.837,0.0431979,45.8,82.6
 deepSeek-R1,4.26,1358,60220,0.844,0.00515901,84.77,223.47
 deepSeek-V3,4.09,1318,45580,0.752,0.00094273,34.57,106.53
 deepSeek-V3-0324,4.16,1372,53240,0.819,0.00102168,42.28,140.54

@@ -13,7 +13,7 @@ gpt-4o-mini,4,1272,35680,0.648,0.00038653,12.17,21.75
 grok-2-1212,4.1,1288,39230,0.709,0.00847157,11.74,23.32
 grok-3-beta,4.34,1402,50630,0.799,0.01694996,33.94,69.79
 llama-3.1-Nemotron-70B-Instruct-HF,4.18,1269,37280,,0.00038647,25.04,48.74
-llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.
+llama-3.3-70B-Instruct,4.02,1257,41110,0.713,0.00035565,31.03,73.7
 llama-3_1-Nemotron-Ultra-253B-v1,4.26,,,0.69,0.0031635,43.84,94.45
 llama-4-Maverick-17B-128E-Instruct-FP8,4,1271,50530,0.809,0.00067195,9.76,23.11
 llama-4-Scout-17B-16E-Instruct,4,,42990,0.752,0.000477,8.49,13.82

@@ -22,5 +22,5 @@ mistral-small-24b-instruct-2501,3.88,1217,35280,0.652,0.00012061,13.99,29.62
 nova-lite-v1,3.89,1217,32530,0.59,0.00015889,5.22,12.47
 nova-pro-v1,3.83,1245,37080,0.691,0.0013758,5.65,9.93
 o3-mini-2025-01-31,4.26,1305,62860,0.791,0.00612595,10.69,23.67
-o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.
-qwen-plus,4.17,1310,,,0.00094732,34.73,66.
+o4-mini-2025-04-16,4.57,,69830,0.832,0.00792985,19.1,52.3
+qwen-plus,4.17,1310,,,0.00094732,34.73,66.7
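A quick way to sanity-check the new schema after pulling this commit, assuming pandas (blank benchmark cells load as NaN):

import pandas as pd

df = pd.read_csv('data/summary_data.csv')
expected = ['Model', 'AutoBench', 'Chatbot Ar.', 'AAI Index', 'MMLU Index',
            'Costs (USD)', 'Avg Answer Duration (sec)', 'P99 Answer Duration (sec)']
assert list(df.columns) == expected, f"unexpected header: {list(df.columns)}"
print(df.sort_values('AutoBench', ascending=False)[['Model', 'AutoBench']].head())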