PeterKruger commited on
Commit
838b31f
·
verified ·
1 Parent(s): 9302f81

Upload 7 files

Browse files
runs/run-2025-12-16/avg_latency.csv ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
2
+ Claude-haiku-4.5,108.2169,77.9828,139.8157,119.3168,96.8819,118.5421,146.1804,93.1935,85.5203,128.0674,110.9457146
3
+ Claude-opus-4.5,205.1172,91.8738,160.2862,154.8248,103.0795,161.9298,168.8554,110.9826,119.3727,160.8702,144.0064821
4
+ Claude-sonnet-4.5,194.2746,97.991,221.7836,197.1376,123.5208,172.1705,224.1097,153.1433,126.5125,185.5291,169.7268622
5
+ DeepSeek-R1-0528,301.0484,49.4935,117.8214,101.4108,88.6982,100.734,306.8704,351.139,122.3086,142.4731,171.4999365
6
+ Deepseek-v3.2,170.3223,50.1187,130.1619,103.7197,86.2436,130.6725,219.3999,157.6304,86.1883,108.4466,124.5749929
7
+ Deepseek-v3.2-speciale,454.4097,373.5313,230.8446,213.1212,265.5541,201.2352,651.1625,388.4446,216.2238,251.4362,310.3903417
8
+ Gemini-2.5-flash,122.2311,28.9762,68.5994,58.9803,57.4608,53.4325,77.5255,70.8783,47.9233,65.6251,65.61706446
9
+ Gemini-2.5-flash-lite,27.6891,7.4738,22.1568,13.5681,17.9023,19.6096,31.563,27.4738,17.0569,16.9643,20.41722765
10
+ Gemini-2.5-pro,109.9939,37.5364,99.5693,87.053,65.1453,74.7684,125.0731,98.0517,75.3436,88.0257,86.79510542
11
+ Gemini-3-pro-preview,111.2193,41.256,68.7167,67.4612,63.8528,59.1828,138.2422,94.61,57.7557,63.3303,76.10772546
12
+ GLM-4.5-Air,250.8921,33.182,130.3234,118.605,102.7298,125.2574,275.6033,337.8757,103.0877,122.1762,163.1509648
13
+ GLM-4.6,187.9063,59.6837,192.016,172.9087,143.9757,161.7514,273.6536,325.9543,153.79,160.8635,187.4263836
14
+ Gpt-5.1,311.144,145.0151,239.826,201.6391,180.4055,222.0834,400.0297,232.0924,177.3329,190.6357,227.425965
15
+ Gpt-5.2,215.3879,72.5414,154.2086,108.3396,115.2593,120.9876,211.1343,111.2079,91.3683,111.1999,130.0950453
16
+ Gpt-5.2-pro,391.1151,208.1175,226.3653,193.8801,223.697,204.9777,576.6401,317.3142,156.8561,194.8504,261.3839264
17
+ Gpt-5-mini,130.2666,49.2126,109.1312,89.203,88.5769,85.6279,103.3383,82.3936,86.7269,102.7151,93.48742858
18
+ Gpt-5-nano,111.2797,56.9614,97.445,93.6769,104.4473,90.7587,149.9171,109.4526,89.6858,94.049,99.62428955
19
+ Gpt-oss-120b,85.5721,30.2059,107.3813,77.7369,54.7208,80.8943,119.5328,48.964,66.8172,85.8429,75.47582399
20
+ Gpt-oss-20b,48.2448,14.5999,37.9083,32.8224,38.5637,29.7853,64.6509,47.4871,31.6468,40.1316,38.76845801
21
+ Grok-4,298.5129,65.5425,144.6898,119.8769,154.5694,121.9038,391.5985,277.8614,135.1043,151.079,180.1110452
22
+ Grok-4.1-fast,22.5872,10.5228,30.5169,29.9234,22.2147,27.6289,22.9019,14.2277,23.0097,30.2953,23.59507062
23
+ Grok-4.1-fast-thinking,99.8118,27.5942,50.9788,55.0636,58.7765,53.4979,147.2731,93.0965,58.3833,58.0907,69.23819567
24
+ Kimi-k2-0905,96.0417,49.851,87.0009,91.0837,57.9927,103.5506,129.4537,56.8817,67.2098,94.3569,82.79755057
25
+ Kimi-k2-thinking,357.8956,211.4411,187.6676,174.5778,156.1618,192.8184,406.495,400.0537,189.5434,217.8827,247.9683578
26
+ Llama-3.3-nemotron-super-49b-v1.5,115.7753,45.765,54.3973,51.8132,44.5401,58.9116,150.4951,135.1039,57.9696,50.2643,76.47567281
27
+ Minimax-m2,258.3904,80.0205,108.2872,89.6492,87.1081,80.6304,210.7194,236.3157,116.3629,82.6208,136.9629944
28
+ Ministral-8b-2512,28.8863,22.0955,41.2578,28.7331,21.6026,29.6067,54.5365,24.612,32.0725,33.7791,31.40083599
29
+ Mistral-large-2512,79.2901,28.5084,126.2813,111.757,80.9586,115.3766,70.4993,54.8645,93.7809,116.9721,89.96343603
30
+ Mistral-medium-3.1,37.2227,15.6724,81.6154,70.4164,46.7696,67.9571,43.1978,23.2783,55.3481,69.7366,52.24553215
31
+ Nemotron-nano-9b-v2,124.962,41.4807,47.0715,36.6855,47.2949,41.3541,127.955,107.3714,40.8651,55.1499,66.77738031
32
+ Nova-2-lite-v1,86.8579,56.826,67.0827,61.455,57.903,54.8436,51.9657,53.2645,60.4673,59.8649,61.45748847
33
+ Nova-premier-v1,59.2369,19.7099,61.1146,61.1911,38.9898,55.9562,49.0279,50.0873,49.4701,62.2549,51.84074232
34
+ Qwen3-235b-a22b-2507,133.3347,35.1341,78.9824,71.2207,115.1465,81.8462,163.3372,184.0751,89.4845,78.7041,104.7811018
35
+ Qwen3-235B-A22B-Thinking-2507,548.9492,102.3888,246.6013,235.7818,240.6059,257.2741,631.6219,589.9518,236.4708,208.5225,316.8201599
36
+ Qwen3-next-80b-a3b-thinking,105.9201,48.549,77.5874,69.3386,85.093,65.7956,98.0509,85.3588,71.0135,68.6137,77.75939135
runs/run-2025-12-16/correlations.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "correlations": {
3
+ "LMArena": 69.19,
4
+ "Artificial Analysis Intelligence Index": 89.38,
5
+ "MMLU": 82.21
6
+
7
+ },
8
+ "description": "Correlation percentages between AutoBench scores and other benchmark scores"
9
+ }
runs/run-2025-12-16/cost_data.csv ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
2
+ Claude-haiku-4.5,0.04796761,0.03238883,0.03107844,0.02638083,0.03072529,0.02659291,0.06254624,0.05931721,0.02862727,0.03348466,0.037873971
3
+ Claude-opus-4.5,0.29227484,0.14104891,0.12830818,0.13549917,0.11872339,0.13533474,0.25805,0.22583962,0.13629584,0.15803141,0.172599703
4
+ Claude-sonnet-4.5,0.14973639,0.07717565,0.10377029,0.0932825,0.07577582,0.08187862,0.1930656,0.17324842,0.08588089,0.10120519,0.113931502
5
+ DeepSeek-R1-0528,0.01875911,0.00303728,0.0047644,0.00431912,0.00500901,0.00403824,0.02015593,0.02310714,0.00674013,0.00709617,0.009866481
6
+ Deepseek-v3.2,0.00109186,0.00052365,0.00081581,0.00070466,0.00075338,0.00075529,0.00153214,0.00117388,0.00079126,0.00072429,0.00088526
7
+ Deepseek-v3.2-speciale,0.00672248,0.00675838,0.00320302,0.00289234,0.00444567,0.00283425,0.01029455,0.00628261,0.00329474,0.00279927,0.004672989
8
+ Gemini-2.5-flash,0.03803445,0.0117833,0.01362167,0.01224066,0.01833685,0.0121711,0.03863442,0.03681212,0.01540965,0.0149641,0.021217848
9
+ Gemini-2.5-flash-lite,0.00277171,0.00084387,0.00132463,0.00109807,0.00167517,0.00119299,0.00468076,0.00449945,0.00172071,0.00134208,0.002139796
10
+ Gemini-2.5-pro,0.08474181,0.03169886,0.04509337,0.04217825,0.04990143,0.04269468,0.13244895,0.11265404,0.05585932,0.04711418,0.064793075
11
+ Gemini-3-pro-preview,0.09641867,0.04151,0.0406917,0.0404174,0.05291838,0.03837331,0.16871096,0.12023964,0.05092595,0.04228494,0.068499733
12
+ GLM-4.5-Air,0.00820787,0.0013094,0.00317276,0.00338355,0.00377618,0.00329219,0.01023654,0.0127487,0.00321646,0.00340004,0.005355855
13
+ GLM-4.6,0.01399979,0.00462601,0.00890891,0.01057332,0.01132542,0.01130957,0.02189323,0.02268625,0.01008674,0.00841888,0.01253707
14
+ Gpt-5.1,0.13881786,0.07678016,0.09721818,0.10773247,0.09562121,0.09244821,0.16249342,0.11152067,0.08273436,0.12518665,0.108002358
15
+ Gpt-5.2,0.13035496,0.04910371,0.05832808,0.03989294,0.06298887,0.0454307,0.14882455,0.1013052,0.0541931,0.05409835,0.073564969
16
+ Gpt-5.2-pro,1.0698774,0.59090945,0.693588,0.54748576,0.80446333,0.5967108,1.50086726,1.25455216,0.57316151,0.63746686,0.818815856
17
+ Gpt-5-mini,0.01101629,0.0066571,0.00806052,0.00685722,0.0085612,0.00951664,0.01375456,0.00993965,0.00845582,0.00856096,0.009137171
18
+ Gpt-5-nano,0.00380816,0.00252091,0.00245592,0.00244178,0.00397283,0.00255242,0.00575112,0.00493667,0.00303911,0.00256656,0.003385019
19
+ Gpt-oss-120b,0.00116925,0.0005053,0.00281574,0.00083875,0.00076515,0.00093969,0.00147282,0.00104197,0.00088673,0.00105028,0.001149135
20
+ Gpt-oss-20b,0.00082608,0.00035786,0.00051247,0.00045562,0.00054506,0.00048785,0.00136087,0.00118873,0.00052532,0.00052515,0.000682808
21
+ Grok-4,0.13011086,0.03172909,0.04242855,0.040092,0.06734988,0.04288654,0.19895795,0.18167038,0.05777479,0.05267316,0.081239078
22
+ Grok-4.1-fast,0.00113864,0.00043969,0.00089972,0.00080692,0.00081899,0.00079567,0.00099131,0.00076386,0.00081525,0.00089737,0.000841039
23
+ Grok-4.1-fast-thinking,0.00413646,0.00118447,0.00132395,0.00122325,0.0020697,0.00128587,0.00746951,0.00541352,0.00215743,0.00164497,0.002722312
24
+ Kimi-k2-0905,0.0040027,0.00200646,0.00369414,0.00327289,0.00220109,0.00368381,0.00514003,0.0031238,0.00287381,0.00342115,0.003344045
25
+ Kimi-k2-thinking,0.02400572,0.01381101,0.01216847,0.01033478,0.01514665,0.01051941,0.0377184,0.03795372,0.01288832,0.01324748,0.018558145
26
+ Llama-3.3-nemotron-super-49b-v1.5,0.00300765,0.00122499,0.00098242,0.00099594,0.00108555,0.00106881,0.00381754,0.00365907,0.00140375,0.00112887,0.001833066
27
+ Minimax-m2,0.01213038,0.00553417,0.00320592,0.00289512,0.00458057,0.00287236,0.01794542,0.01389677,0.005267,0.00354698,0.00711788
28
+ Ministral-8b-2512,0.00046259,0.00026429,0.00054361,0.00049856,0.00041828,0.00048709,0.00079872,0.00051314,0.00042434,0.00049399,0.000489568
29
+ Mistral-large-2512,0.00471713,0.00195759,0.00594909,0.00554823,0.00492521,0.00553417,0.0054327,0.00507622,0.0053989,0.00568219,0.005120599
30
+ Mistral-medium-3.1,0.00336013,0.00167505,0.00473395,0.0041676,0.00330309,0.00418606,0.0048653,0.00326612,0.00349375,0.00414133,0.003751786
31
+ Nemotron-nano-9b-v2,0.00153706,0.00057663,0.00044255,0.00039797,0.0005314,0.00045132,0.00166813,0.00156872,0.00053541,0.00051641,0.000820591
32
+ Nova-2-lite-v1,0.07547446,0.04099998,0.03170261,0.0309826,0.0359062,0.02763011,0.04359764,0.04758953,0.03771458,0.03259245,0.039437834
33
+ Nova-premier-v1,0.01677879,0.00763352,0.01237621,0.01206467,0.01235429,0.01140471,0.0148664,0.01551314,0.01240074,0.01254813,0.012950513
34
+ Qwen3-235b-a22b-2507,0.00184113,0.00043658,0.00158505,0.00120764,0.00209282,0.00146099,0.00359504,0.00329687,0.00165955,0.00172153,0.001917167
35
+ Qwen3-235B-A22B-Thinking-2507,0.00549796,0.00113073,0.00201848,0.00202241,0.00223348,0.00210888,0.00760879,0.00690929,0.00217258,0.00206767,0.003170949
36
+ Qwen3-next-80b-a3b-thinking,0.0099701,0.00608854,0.00549343,0.00515869,0.00768631,0.00488623,0.01232633,0.01059962,0.00707064,0.00592575,0.007494371
runs/run-2025-12-16/domain_ranks.csv ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
2
+ Claude-haiku-4.5,3.896,4.1798,4.4195,4.5162,4.3002,4.4807,3.7367,3.5258,4.1594,4.5444,4.170821
3
+ Claude-opus-4.5,4.3253,4.3694,4.1679,4.6497,4.4774,4.4952,4.1393,4.214,4.5653,4.5178,4.39496
4
+ Claude-sonnet-4.5,4.1231,4.5017,4.4807,4.5748,4.4423,4.4153,3.5709,3.8669,4.4844,4.5519,4.30218
5
+ DeepSeek-R1-0528,3.7053,4.0733,4.3708,4.4378,4.1089,4.4467,3.5519,3.6497,4.3068,4.4322,4.118577
6
+ Deepseek-v3.2,3.741,4.0014,4.5128,4.4394,4.1404,4.3174,3.4795,3.7235,4.201,4.3906,4.109586
7
+ Deepseek-v3.2-speciale,4.0964,3.7169,4.1727,4.2891,4.2365,4.3793,3.558,3.9619,4.3353,4.3067,4.141433
8
+ Gemini-2.5-flash,4.01,4.046,4.3665,4.215,4.3058,4.2863,3.8277,4.024,4.1208,4.4398,4.171935
9
+ Gemini-2.5-flash-lite,3.9154,4.0888,4.2033,4.15,3.7796,4.3708,3.2679,3.3294,4.0502,4.2879,3.94904
10
+ Gemini-2.5-pro,4.0197,4.1507,4.3678,4.4328,4.3717,4.4027,4.0175,4.2377,4.4451,4.3803,4.294065
11
+ Gemini-3-pro-preview,4.2254,4.352,4.5077,4.6974,4.2012,4.5579,4.123,4.2957,4.5101,4.4766,4.405224
12
+ GLM-4.5-Air,3.4011,3.6283,4.0442,4.4026,3.6135,4.1639,3.6893,3.466,3.8959,4.2471,3.864646
13
+ GLM-4.6,3.9502,4.154,4.1144,4.4669,4.1101,4.2879,3.4484,4.0146,4.2906,4.3261,4.132794
14
+ Gpt-5.1,4.1923,4.3363,4.4623,4.6766,4.5045,4.4358,4.0829,4.1333,4.4884,4.4885,4.38364
15
+ Gpt-5.2,4.2989,4.4394,4.5439,4.3957,4.3679,4.5977,4.1779,4.2614,4.5517,4.5929,4.430061
16
+ Gpt-5.2-pro,4.3745,4.4942,4.5626,4.6854,4.5249,4.5944,4.3153,4.2901,4.3928,4.5531,4.476206
17
+ Gpt-5-mini,4.18,4.3856,4.4526,4.4846,4.2918,4.2895,3.8962,4.0472,4.3236,4.5178,4.287269
18
+ Gpt-5-nano,3.9829,3.847,4.2798,4.2864,3.9098,4.1801,3.6059,3.873,4.2301,4.2294,4.060397
19
+ Gpt-oss-120b,4.2552,4.1312,4.0549,4.2073,4.0356,4.3422,3.8138,3.8927,4.4529,4.508,4.181097
20
+ Gpt-oss-20b,3.9278,3.8177,3.4139,3.777,3.5187,3.3964,3.8702,3.8189,4.1819,4.0654,3.779105
21
+ Grok-4,4.1127,4.2019,4.2275,4.258,4.2027,4.3413,4.1751,3.8096,4.2996,4.3325,4.197064
22
+ Grok-4.1-fast,3.2714,4.1437,4.1517,4.4933,4.0447,4.3487,3.1697,2.8397,4.162,4.22,3.877365
23
+ Grok-4.1-fast-thinking,3.8834,4.2013,4.3318,4.4323,4.178,4.3843,3.7149,3.8893,4.4811,4.3875,4.206201
24
+ Kimi-k2-0905,3.6093,4.2553,4.4514,4.6494,4.0671,4.5102,3.4043,3.4026,4.2795,4.4418,4.107852
25
+ Kimi-k2-thinking,4.0332,4.3655,4.4615,4.6033,4.2796,4.5443,3.8119,4.129,4.2333,4.5553,4.315342
26
+ Llama-3.3-nemotron-super-49b-v1.5,3.2916,3.8451,4.129,4.3489,3.6246,4.2599,3.1084,3.1014,3.8956,4.1353,3.783612
27
+ Minimax-m2,3.4588,3.6467,4.3288,4.2996,4.0075,4.1983,3.3369,3.9375,4.1415,4.2852,3.990557
28
+ Ministral-8b-2512,2.9477,3.5676,4.0966,4.1553,3.2352,3.9678,2.9408,2.7773,3.7756,4.1015,3.570151
29
+ Mistral-large-2512,3.5392,3.8875,4.135,4.4486,4.1053,4.3948,3.4126,3.1081,4.0114,4.2951,3.935105
30
+ Mistral-medium-3.1,3.025,3.9361,4.2094,4.0717,4.0296,4.4697,3.2035,2.8908,3.9827,4.2469,3.811798
31
+ Nemotron-nano-9b-v2,3.0723,3.0018,3.9185,4.0366,2.9714,3.9167,2.8622,3.0697,3.7831,4.0139,3.500291
32
+ Nova-2-lite-v1,3.7409,3.8832,4.2516,4.2551,4.0813,4.3824,3.0429,3.641,4.2797,4.3358,4.059981
33
+ Nova-premier-v1,2.8353,3.5547,3.7279,3.9576,3.3375,3.9765,2.5511,2.8474,3.8154,3.9793,3.473742
34
+ Qwen3-235b-a22b-2507,3.6516,3.842,4.0547,4.3213,3.9642,4.2631,3.2727,3.7025,4.289,4.2662,3.98095
35
+ Qwen3-235B-A22B-Thinking-2507,3.7017,4.2155,4.1925,4.5712,4.2408,4.4447,3.8836,3.7789,4.2573,4.4556,4.196769
36
+ Qwen3-next-80b-a3b-thinking,3.7023,3.7423,4.0543,4.4018,4.1316,4.2153,3.7258,3.7883,4.24,4.2008,4.031744
runs/run-2025-12-16/metadata.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "run_id": "run_2025-12-16",
3
+ "title": "AutoBench Run 5 - December 2025",
4
+ "date": "2025-12-16",
5
+ "description": "Latest AutoBench run with models Gpt 5.2, Claude Opus 4.5 and more",
6
+ "blog_url": "https://huggingface.co/blog/PeterKruger/autobench-5th-run",
7
+ "model_count": 35,
8
+ "is_latest": true
9
+ }
runs/run-2025-12-16/p99_latency.csv ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ model_name,coding,creative writing,current news,general culture,grammar,history,logics,math,science,technology,Average (All Topics)
2
+ Claude-haiku-4.5,303.6944,172.9266,283.9706,514.833,246.103,325.9682,418.1896,361.1314,252.934,289.4724,316.9223
3
+ Claude-opus-4.5,834.1169,210.0999,329.939,377.647,254.7644,382.0941,493.5189,264.7656,281.6953,304.4306,373.3072
4
+ Claude-sonnet-4.5,847.6184,216.3561,502.248,555.5108,406.2059,564.1001,526.6079,428.4763,340.1804,380.1887,476.7493
5
+ DeepSeek-R1-0528,844.7599,99.4614,344.6565,310.4039,362.7391,220.5598,737.7579,1043.4488,548.9713,253.7458,476.6504
6
+ Deepseek-v3.2,631.2052,193.2873,356.0249,313.1029,393.9501,395.7449,541.8973,691.5566,278.3706,309.4961,410.4636
7
+ Deepseek-v3.2-speciale,1311.9582,650.2337,528.4946,503.1869,607.8602,403.1232,1755.1338,1282.7077,701.6598,582.3526,832.6711
8
+ Gemini-2.5-flash,475.328,58.3303,184.8024,139.9909,178.2687,125.5033,154.3759,178.7707,112.877,131.1387,173.9386
9
+ Gemini-2.5-flash-lite,78.696,11.6849,128.7149,38.8007,44.9652,110.2367,86.6394,74.8385,84.655,31.6472,69.0879
10
+ Gemini-2.5-pro,306.2334,63.3816,314.2017,209.8994,160.4949,139.1723,328.0977,268.0161,228.2436,207.0017,222.4742
11
+ Gemini-3-pro-preview,381.7002,98.8641,168.2995,140.8396,157.0393,98.373,342.9425,247.1862,129.185,97.1085,186.1538
12
+ GLM-4.5-Air,945.3131,88.5651,379.6257,236.088,270.8741,253.7883,632.8253,900.4997,274.4039,270.8565,425.284
13
+ GLM-4.6,726.6297,152.2205,537.891,591.0155,502.2694,618.9771,898.3246,1317.976,546.1143,413.4574,630.4876
14
+ Gpt-5.1,991.0433,369.4258,416.6634,430.177,560.7524,468.2914,1079.6459,1077.5916,512.7422,366.9878,627.3321
15
+ Gpt-5.2,983.2315,173.1477,458.684,401.1408,278.51,244.5978,743.5915,574.3997,283.5269,200.6274,434.1457
16
+ Gpt-5.2-pro,1424.653,435.1419,477.5699,586.2364,582.9081,431.5706,1706.3481,1289.0466,425.5168,479.1999,783.8191
17
+ Gpt-5-mini,560.1197,166.9813,239.9502,185.6802,267.7198,200.7012,246.3061,214.9289,269.2209,226.6608,257.8269
18
+ Gpt-5-nano,282.1782,98.8659,238.9069,231.8139,302.9173,185.8154,490.2644,440.4728,259.7014,157.3791,268.8315
19
+ Gpt-oss-120b,309.681,98.5909,308.8411,258.6252,313.478,194.0417,684.8875,180.4313,380.1071,189.748,291.8432
20
+ Gpt-oss-20b,279.1046,37.3872,221.6344,94.4435,235.0365,117.9081,253.5315,225.409,147.6625,218.1006,183.0218
21
+ Grok-4,1010.6335,281.4145,304.5292,211.6889,448.4332,189.3991,1420.3039,1052.7989,377.3943,327.9724,562.4568
22
+ Grok-4.1-fast,49.7465,20.1795,54.4293,78.2517,46.273,49.4893,77.6144,71.8666,48.1681,78.3591,57.4378
23
+ Grok-4.1-fast-thinking,285.175,73.4358,100.4626,138.1754,177.5368,99.1367,516.521,266.009,246.663,167.1038,207.0219
24
+ Kimi-k2-0905,537.4,227.5026,434.3624,300.1772,262.8326,233.5223,537.1856,355.6894,188.4261,212.9576,329.0056
25
+ Kimi-k2-thinking,1063.4055,599.8131,401.0736,574.3742,506.6306,437.6233,1223.912,1507.2595,520.558,456.5078,729.1158
26
+ Llama-3.3-nemotron-super-49b-v1.5,408.0621,271.4401,160.5405,133.0068,155.2934,142.2843,299.6682,423.4154,257.5173,151.9354,240.3163
27
+ Minimax-m2,991.8198,589.4947,238.879,288.4827,400.5846,201.5793,562.5588,746.3536,374.692,333.6929,472.8137
28
+ Ministral-8b-2512,108.0489,233.4817,256.4196,82.3999,54.9958,104.629,423.6525,68.7241,92.101,116.9296,154.1382
29
+ Mistral-large-2512,318.5243,67.4724,277.4699,230.3884,182.5314,212.874,132.5931,141.9202,204.2604,213.2644,198.1299
30
+ Mistral-medium-3.1,101.3717,25.4349,207.9478,212.7246,182.3555,175.8245,108.1197,63.577,166.3996,225.5168,146.9272
31
+ Nemotron-nano-9b-v2,338.6557,195.0248,111.1827,64.8506,178.3646,92.1459,473.7478,272.3648,169.0054,221.995,211.7337
32
+ Nova-2-lite-v1,173.4527,103.964,157.1997,159.4808,133.8659,108.7504,116.0531,119.6456,135.6125,107.9884,131.6013
33
+ Nova-premier-v1,177.9887,35.0321,225.562,188.5122,93.0407,103.8183,83.1841,94.8349,145.6836,197.7514,134.5408
34
+ Qwen3-235b-a22b-2507,388.6708,256.5265,274.0394,224.9808,386.1726,290.7797,524.4854,507.0452,284.5746,234.8702,337.2145
35
+ Qwen3-235B-A22B-Thinking-2507,1366.0363,248.1267,506.3298,540.1404,515.3773,580.0747,1677.411,1660.089,582.4473,432.8225,810.8855
36
+ Qwen3-next-80b-a3b-thinking,497.6184,107.4429,187.0656,154.3664,247.5986,120.2682,189.55,343.7824,279.0175,140.0428,226.6753
runs/run-2025-12-16/summary_data.csv ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ Model,Iterations,AutoBench,LMArena,AAI Index,MMLU-Pro,Costs (USD),Avg Answer Duration (sec),P99 Answer Duration (sec),Fail Rate %
2
+ Gpt-5.2-pro,303,4.48,,72,87%,0.8188,261,784,3.81%
3
+ Gpt-5.2,312,4.43,,,,0.0736,130,434,0.95%
4
+ Gemini-3-pro-preview,312,4.41,1492,73,90%,0.0685,76,186,0.95%
5
+ Claude-opus-4.5,313,4.39,1470,70,90%,0.1726,144,373,0.63%
6
+ Gpt-5.1,310,4.38,1457,70,87%,0.1080,227,627,1.59%
7
+ Kimi-k2-thinking,287,4.32,1429,67,85%,0.0186,248,729,8.89%
8
+ Claude-sonnet-4.5,307,4.30,1450,63,88%,0.1139,170,477,2.54%
9
+ Gemini-2.5-pro,313,4.29,1451,60,86%,0.0648,87,222,0.63%
10
+ Gpt-5-mini,312,4.29,1392,64,84%,0.0091,93,258,0.95%
11
+ Grok-4.1-fast-thinking,306,4.21,,64,85%,0.0027,69,207,2.86%
12
+ Grok-4,293,4.20,1478,65,87%,0.0812,180,562,6.98%
13
+ Qwen3-235B-A22B-Thinking-2507,283,4.20,1397,57,84%,0.0032,317,811,10.16%
14
+ Gpt-oss-120b,292,4.18,1352,61,81%,0.0011,75,292,7.30%
15
+ Gemini-2.5-flash,312,4.17,1408,51,84%,0.0212,66,174,0.95%
16
+ Claude-haiku-4.5,312,4.17,1402,55,76%,0.0379,111,317,0.95%
17
+ Deepseek-v3.2-speciale,288,4.14,1418,59,86%,0.0047,310,833,8.57%
18
+ GLM-4.6,306,4.13,1425,56,83%,0.0125,187,630,2.86%
19
+ DeepSeek-R1-0528,308,4.12,1395,52,85%,0.0099,171,477,2.22%
20
+ Deepseek-v3.2,311,4.11,1414,52,84%,0.0009,125,410,1.27%
21
+ Kimi-k2-0905,312,4.11,1416,50,82%,0.0033,83,329,0.95%
22
+ Gpt-5-nano,309,4.06,1339,51,77%,0.0034,100,269,1.90%
23
+ Nova-2-lite-v1,277,4.06,1334,47,81%,0.0394,61,132,12.06%
24
+ Qwen3-next-80b-a3b-thinking,312,4.03,1367,54,82%,0.0075,78,227,0.95%
25
+ Minimax-m2,308,3.99,1345,61,82%,0.0071,137,473,2.22%
26
+ Qwen3-235b-a22b-2507,302,3.98,1374,45,83%,0.0019,105,337,1.31%
27
+ Gemini-2.5-flash-lite,313,3.95,1378,40,81%,0.0021,20,69,0.63%
28
+ Mistral-large-2512,307,3.94,1415,38,81%,0.0051,90,198,2.54%
29
+ Grok-4.1-fast,312,3.88,,38,74%,0.0008,24,57,0.95%
30
+ GLM-4.5-Air,306,3.86,1370,49,82%,0.0054,163,425,2.86%
31
+ Mistral-medium-3.1,306,3.81,1411,35,68%,0.0038,52,147,2.86%
32
+ Llama-3.3-nemotron-super-49b-v1.5,311,3.78,1340,45,81%,0.0018,76,240,1.27%
33
+ Gpt-oss-20b,310,3.78,1318,52,75%,0.0007,39,183,1.59%
34
+ Ministral-8b-2512,306,3.57,,28,64%,0.0005,31,154,2.86%
35
+ Nemotron-nano-9b-v2,311,3.50,,37,74%,0.0008,67,212,1.27%
36
+ Nova-premier-v1,312,3.47,,32,73%,0.0130,52,135,0.95%