Upload from GitHub Actions: updated and cleaned up scripts for new eval runs
Files changed:
- .github/workflows/nightly-evals.yml (+4 -24)
- evals/datasets_/mgsm.py (+29 -22)
- evals/datasets_/mmlu.py (+34 -15)
- evals/main.py (+88 -232)
- evals/models.py (+3 -35)
- evals/tasks.py (+69 -82)
- languages.json (+28 -28)
- models.json (+10 -976)
- results.json (+2 -2)
.github/workflows/nightly-evals.yml
CHANGED

@@ -8,6 +8,7 @@ on:
 jobs:
   run-evals:
     runs-on: ubuntu-latest
+    # checking if this is working in case eval runs take longer than 6h github actions allowance
     timeout-minutes: 1440  # 24 hours timeout
     steps:
       - uses: actions/checkout@v3
@@ -22,7 +23,7 @@ jobs:
           curl -LsSf https://astral.sh/uv/install.sh | sh
           uv sync --frozen --extra dev

-      - name: Run evaluations
+      - name: Run evaluations
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
@@ -31,28 +32,7 @@ jobs:
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
-
-          # Run evaluations with periodic checkpointing
-          uv run python -c "
-          import time
-          import subprocess
-          import json
-          import os
-
-          # Check if we have existing results to resume from
-          if os.path.exists('results.json'):
-              print('Found existing results.json, will resume from checkpoint')
-
-          # Run the main evaluation
-          try:
-              subprocess.run(['uv', 'run', 'evals/main.py'], check=True)
-          except subprocess.CalledProcessError as e:
-              print(f'Evaluation failed: {e}')
-              # Save current state even if failed
-              if os.path.exists('results.json'):
-                  print('Saving checkpoint before exit...')
-              exit(1)
-          "
+          uv run evals/main.py

      - name: Commit changes
        env:
@@ -62,7 +42,7 @@ jobs:
          git config --local user.name "github-actions[bot]"
          git config --local --unset-all http.https://github.com/.extraheader
          git remote set-url origin https://${GH_PAT}@github.com/datenlabor-bmz/ai-language-monitor.git
-          git add results.json models.json languages.json
+          git add results.json models.json languages.json
          git commit -m "Update evaluation results" || echo "No changes to commit"
          git push origin HEAD:main
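The checkpoint wrapper removed above is redundant because resumability now lives in evals/main.py itself: already-scored (model, language, task) combinations are filtered out against results.json before any API call. Below is a minimal sketch of that resume-by-merge idea, assuming the results.json schema used in this commit; pending_combinations is a hypothetical helper name, not code from the repo.

import pandas as pd

COLS = ["model", "bcp_47", "task", "metric", "origin", "score"]

def pending_combinations(candidates: pd.DataFrame, results_path: str = "results.json") -> pd.DataFrame:
    """Return only the (model, bcp_47, task) rows that have no stored metric yet."""
    try:
        old = pd.read_json(results_path)
    except (FileNotFoundError, ValueError):
        old = pd.DataFrame(columns=COLS)
    if old.empty:
        old = pd.DataFrame(columns=COLS)
    # Rows that fail to join against stored results have a NaN metric and still need a run.
    merged = candidates.merge(old, on=["model", "bcp_47", "task"], how="left")
    return merged[merged["metric"].isna()][["model", "bcp_47", "task"]]

Because completed work is skipped on the next invocation, a crashed run can simply be restarted; no separate checkpoint file is needed.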
evals/datasets_/mgsm.py
CHANGED

@@ -3,7 +3,7 @@ import os
 import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -39,32 +39,39 @@ def parse_number(i):
     return None
 
 
+@cache
+def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
+    """Cache individual MGSM items efficiently"""
+    try:
+        ds = _load_dataset(dataset_slug, subset=subset_tag, split="test", trust_remote_code=trust_remote_code)
+        if nr >= len(ds):
+            return None
+        row = ds[nr]
+        # Post-process based on dataset type
+        if dataset_slug == slug_gsm8kx:
+            row["answer_number"] = row["answer"].split("####")[1].strip()
+        return row
+    except Exception:
+        # Dataset doesn't exist or doesn't have test split
+        return None
+
+
 def load_mgsm(language_bcp_47, nr):
-    print(f"Loading MGSM data for {language_bcp_47}...")
     if language_bcp_47 in tags_mgsm.keys():
-        ds = _load_dataset(slug_mgsm, subset=tags_mgsm[language_bcp_47], split="test")
-        return slug_mgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
+        return (slug_mgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_afrimgsm.keys():
-        ds = _load_dataset(
-            slug_afrimgsm, subset=tags_afrimgsm[language_bcp_47], split="test",
-        )
-        return slug_afrimgsm, ds[nr], "human"
+        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
+        return (slug_afrimgsm, item, "human") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm8kx.keys():
-        row = _load_dataset(
-            slug_gsm8kx,
-            subset=tags_gsm8kx[language_bcp_47],
-            split="test",
-            trust_remote_code=True,
-        )[nr]
-        row["answer_number"] = row["answer"].split("####")[1].strip()
-        return slug_gsm8kx, row, "machine"
+        item = _get_mgsm_item(slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True)
+        return (slug_gsm8kx, item, "machine") if item else (None, None, None)
     elif language_bcp_47 in tags_gsm_autotranslated.keys():
-        ds = _load_dataset(
-            slug_gsm_autotranslated,
-            subset=tags_gsm_autotranslated[language_bcp_47],
-            split="test",
-        )
-        return slug_gsm_autotranslated, ds[nr], "machine"
+        item = _get_mgsm_item(slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr)
+        return (slug_gsm_autotranslated, item, "machine") if item else (None, None, None)
     else:
         return None, None, None
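Note on the return statements in this hunk: as originally committed they read `return slug_mgsm, item, "human" if item else (None, None, None)`, which Python parses as a 3-tuple whose last element is the conditional expression, so callers never receive the (None, None, None) sentinel. The parenthesized form shown above is the intended behavior. A standalone check of the precedence:

# Why the parentheses matter: the conditional expression binds tighter than the comma.
item = None
bad = ("slug", item, "human" if item else (None, None, None))
good = ("slug", item, "human") if item else (None, None, None)
assert bad == ("slug", None, (None, None, None))  # 3-tuple with a nested tuple inside
assert good == (None, None, None)                 # the intended sentinel triple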
evals/datasets_/mmlu.py
CHANGED

@@ -4,7 +4,7 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -144,32 +144,51 @@ tags_mmlux = set(
     a.rsplit("_", 1)[1].split("-")[0].lower()
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
-tags_mmlu_autotranslated =
+tags_mmlu_autotranslated = {
+    standardize_tag(a, macro=True): a
+    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
+}
 
 categories = sorted(
     list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
 )
 
 
+@cache
+def _get_processed_mmlu_dataset(dataset_name, subset_tag):
+    """Cache processed datasets to avoid reprocessing"""
+    ds = _load_dataset(dataset_name, subset_tag)
+    if dataset_name == "masakhane/afrimmlu":
+        ds = ds.map(parse_choices)
+    elif dataset_name == "CohereForAI/Global-MMLU":
+        ds = ds.map(add_choices)
+    return ds
+
+
+@cache
+def _get_mmlu_item(dataset_name, subset_tag, category, nr):
+    """Cache individual MMLU items efficiently"""
+    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
+    filtered = ds["test"].filter(lambda x: x["subject"] == category)
+    return filtered[nr] if nr < len(filtered) else None
+
+
 async def load_mmlu(language_bcp_47, nr):
-    print(f"Loading MMLU data for {language_bcp_47}...")
     category = categories[nr % len(categories)]
     if language_bcp_47 in tags_afrimmlu.keys():
-        ds = _load_dataset("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47])
-        ds = ds.map(parse_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "masakhane/afrimmlu", task, "human"
+        task = _get_mmlu_item("masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr)
+        return ("masakhane/afrimmlu", task, "human") if task else (None, None, None)
     elif language_bcp_47 in tags_global_mmlu.keys():
-        ds = _load_dataset("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47])
-        ds = ds.map(add_choices)
-        task = ds["test"].filter(lambda x: x["subject"] == category)[nr]
-        return "CohereForAI/Global-MMLU", task, "human"
+        task = _get_mmlu_item("CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr)
+        return ("CohereForAI/Global-MMLU", task, "human") if task else (None, None, None)
     # TODO: add in Okapi, MMLUX @Jonas
     elif language_bcp_47 in tags_mmlu_autotranslated:
-        filtered = ds["test"].filter(lambda x: x["subject"] == category)
-        task = filtered[nr]
-        return "fair-forward/mmlu-autotranslated", task, "machine"
+        task = _get_mmlu_item("fair-forward/mmlu-autotranslated", language_bcp_47, category, nr)
+        return ("fair-forward/mmlu-autotranslated", task, "machine") if task else (None, None, None)
     else:
         return None, None, None
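Both helpers lean on the `cache` decorator imported from datasets_.util, whose definition is not part of this diff. The sketch below assumes it behaves like argument-keyed memoization; functools gives the same shape for hashable arguments.

from functools import cache  # stand-in with the same call-once-per-arguments shape

@cache
def expensive_lookup(dataset_name: str, subset_tag: str) -> str:
    print(f"loading {dataset_name}/{subset_tag}")  # executes once per argument pair
    return f"{dataset_name}/{subset_tag}"

expensive_lookup("masakhane/afrimmlu", "eng")  # prints, then caches
expensive_lookup("masakhane/afrimmlu", "eng")  # served from cache, no print

Caching at two levels means the mapped dataset is shared across items, while the per-category subject filter is recomputed at most once per (dataset, subset, category, nr).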
evals/main.py
CHANGED

@@ -1,271 +1,127 @@
 import asyncio
 import pandas as pd
 import time
-import os
 from datetime import datetime, timedelta
-from tqdm.asyncio import tqdm_asyncio
 from models import models
 from tasks import tasks
 from languages import languages
-import json
-
-results = pd.DataFrame()
-
-def save_checkpoint(results_df, models_df, languages_df, batch_num, total_batches):
-    """Save current progress as checkpoint"""
-    try:
-        args = dict(orient="records", indent=2, force_ascii=False)
-
-        # Save current results
-        if len(results_df) > 0:
-            results_df.to_json("results.json", **args)
-            print(f"💾 Checkpoint saved: {len(results_df)} results (batch {batch_num}/{total_batches})")
-
-        # Save model and language info
-        models_df.to_json("models.json", **args)
-        languages_df.to_json("languages.json", **args)
-
-        # Save checkpoint metadata
-        checkpoint_info = {
-            "last_batch": batch_num,
-            "total_batches": total_batches,
-            "timestamp": datetime.now().isoformat(),
-            "results_count": len(results_df)
-        }
-        with open("checkpoint.json", "w") as f:
-            json.dump(checkpoint_info, f, indent=2)
-
-    except Exception as e:
-        print(f"⚠️ Failed to save checkpoint: {e}")
-
-def load_checkpoint():
-    """Load previous checkpoint if available"""
-    try:
-        if os.path.exists("checkpoint.json"):
-            with open("checkpoint.json", "r") as f:
-                checkpoint = json.load(f)
-            print(f"📂 Found checkpoint from batch {checkpoint['last_batch']}/{checkpoint['total_batches']}")
-            return checkpoint
-    except Exception as e:
-        print(f"⚠️ Failed to load checkpoint: {e}")
-    return None
+import os
 
 async def evaluate():
-    n_sentences = int(os.environ.get("N_SENTENCES", ...))
+    # Configuration - easily adjustable defaults
+    n_sentences = int(os.environ.get("N_SENTENCES", 20))  # Default: 20 sentences per task
+    max_languages = int(os.environ.get("MAX_LANGUAGES", 150))  # Default: 150 top languages
+    single_model = os.environ.get("SINGLE_MODEL")  # Optional: run only one specific model
+    test_mode = os.environ.get("TEST", "").lower() in ("1", "true", "yes")  # Optional: skip results loading/saving
 
-    # Load models and languages
     models_df = pd.DataFrame(models)
     languages_df = pd.DataFrame(languages)
+    top_languages = languages.head(max_languages)
+
+    # Filter to single model if specified
+    if single_model:
+        models_df = models_df[models_df["id"] == single_model]
+        if len(models_df) == 0:
+            print(f"Error: Model '{single_model}' not found. Available models:")
+            for model_id in pd.DataFrame(models)["id"]:
+                print(f"  {model_id}")
+            return pd.DataFrame()
 
+    print(f"Starting evaluation: {len(models_df)} models, {len(top_languages)} languages, {n_sentences} sentences per task")
+    if test_mode:
+        print("TEST MODE: Skipping results loading/saving")
     start_time = time.time()
-    print(f"🚀 Starting full evaluation at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-    print(f"📊 Evaluating {n_sentences} sentences per task")
 
-    # Load checkpoint if available
-    checkpoint = load_checkpoint()
-    start_batch = 0
-    if checkpoint:
-        start_batch = checkpoint['last_batch']
-        print(f"🔄 Resuming from batch {start_batch}")
-
-    # For testing, just use all available languages up to max_languages
-    for n_languages in [min(max_languages, len(top_languages))]:
-        print(f"running evaluations for {n_languages} languages")
-
-        # Load existing results
+    # Load existing results to avoid re-evaluation (skip in test mode)
+    if test_mode:
+        old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
+    else:
         try:
             old_results = pd.read_json("results.json")
             if old_results.empty:
                 old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
         except FileNotFoundError:
            old_results = pd.DataFrame(columns=["model", "bcp_47", "task", "metric", "origin", "score"])
 
-    try:
-        old_models = pd.read_json("models.json")
-    except FileNotFoundError:
-        old_models = pd.DataFrame()
-
-    # get all combinations of model, language and task
-    combis = [
-        (model, lang.bcp_47, task_name)
-        for model in models_df["id"]
-        for lang in top_languages.iloc[:n_languages].itertuples()
-        for task_name, task in tasks.items()
-        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
-    ]
-    # filter out combinations that have already been evaluated
-    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
-    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
-    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
-    # run evaluations in batches to prevent HTTP pool exhaustion
-    all_tasks = []
-    for i in range(n_sentences):
-        for model, bcp_47, task_name in combis.itertuples(index=False):
-            # All tasks now use the same signature
-            all_tasks.append((tasks[task_name], model, bcp_47, i))
-
-    print(f"⏳ Processing {len(all_tasks)} evaluation tasks in batches...")
-
-    batch_size = 200  # Process 200 tasks at a time (optimized for GitHub Actions)
-    all_results = []
-
-    # Calculate total batches for progress tracking
-    total_batches = (len(all_tasks) + batch_size - 1) // batch_size
-
-    for i in range(start_batch * batch_size, len(all_tasks), batch_size):
-        batch = all_tasks[i:i+batch_size]
-        current_batch = i // batch_size + 1
-
-        print(f"📦 Processing batch {current_batch}/{total_batches} ({len(batch)} tasks)")
-
-        # Show what's being evaluated in this batch
-        batch_summary = {}
-        for task_data in batch:
-            task_func, model, bcp_47, sentence_nr = task_data
-            # Extract task name from function - handle both partial functions and regular functions
-            if hasattr(task_func, 'func'):
-                task_name = task_func.func.__name__.replace('_and_evaluate', '')
-            else:
-                task_name = task_func.__name__.replace('_and_evaluate', '')
-
-            if task_name not in batch_summary:
-                batch_summary[task_name] = set()
-            batch_summary[task_name].add(bcp_47)
-
-        for task_name, languages_set in batch_summary.items():
-            lang_list = ', '.join(sorted(languages_set))
-            print(f"  🔄 {task_name}: {lang_list}")
-
-        batch_coroutines = []
-        for task_data in batch:
-            task_func, model, bcp_47, sentence_nr = task_data
-            batch_coroutines.append(task_func(model, bcp_47, sentence_nr))
-
-        try:
-            batch_results = await asyncio.gather(*batch_coroutines, return_exceptions=True)
-            all_results.extend(batch_results)
-
-            # Save checkpoint after each batch
-            valid_results = []
-            exception_count = 0
-            for r in batch_results:
-                if isinstance(r, Exception):
-                    exception_count += 1
-                    continue
-                if isinstance(r, list):
-                    valid_results.extend(r)
-                else:
-                    valid_results.append(r)
-
-            if valid_results:
-                # Aggregate results
-                batch_df = pd.DataFrame(valid_results)
-                if len(batch_df) > 0:
-                    batch_df = (
-                        batch_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
-                        .agg({"score": "mean"})
-                        .reset_index()
-                    )
-                    # Merge with existing results
-                    all_results_df = pd.concat([old_results, batch_df])
-                    all_results_df = all_results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-                    all_results_df = all_results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-
-                    # Save checkpoint
-                    save_checkpoint(all_results_df, models_df, languages_df, current_batch, total_batches)
-
-                    # Update old_results for next batch
-                    old_results = all_results_df
-
-            print(f"✅ Batch {current_batch} completed: {len(valid_results)} valid results, {exception_count} errors")
-
-        except Exception as e:
-            print(f"❌ Batch {current_batch} failed: {e}")
-            # Save checkpoint even on failure
-            if len(all_results) > 0:
-                results_df = pd.DataFrame(all_results)
-                save_checkpoint(results_df, models_df, languages_df, current_batch, total_batches)
-            continue
-
-        # Reduced delay between batches (optimized for GitHub Actions)
-        await asyncio.sleep(0.5)
-
-    # Final aggregation and save
-    results = all_results
-    # Filter out exceptions and flatten results
-    valid_results = []
-    exception_count = 0
-    for r in results:
-        if isinstance(r, Exception):
-            exception_count += 1
-            continue
-        if isinstance(r, list):
-            valid_results.extend(r)
-        else:
-            valid_results.append(r)
+    # Get all combinations that need evaluation
+    combis = [
+        (model, lang.bcp_47, task_name)
+        for model in models_df["id"]
+        for lang in top_languages.itertuples()
+        for task_name, task in tasks.items()
+        if task_name in models_df[models_df["id"] == model]["tasks"].iloc[0]
+    ]
+
+    # Filter out already evaluated combinations
+    combis = pd.DataFrame(combis, columns=["model", "bcp_47", "task"])
+    combis = combis.merge(old_results, on=["model", "bcp_47", "task"], how="left")
+    combis = combis[combis["metric"].isna()][["model", "bcp_47", "task"]]
+
+    # Create all evaluation tasks
+    all_tasks = []
+    for i in range(n_sentences):
+        for model, bcp_47, task_name in combis.itertuples(index=False):
+            all_tasks.append((tasks[task_name], model, bcp_47, i))
+
+    print(f"Running {len(all_tasks)} evaluation tasks...")
+
+    # Run all tasks with simple asyncio.gather, but stop on first error
+    try:
+        results = await asyncio.gather(
+            *[task_func(model, bcp_47, sentence_nr) for task_func, model, bcp_47, sentence_nr in all_tasks],
+            return_exceptions=False  # This will raise on first exception
+        )
+
+        # Process results - no exceptions should reach here
+        valid_results = []
+        for r in results:
+            if isinstance(r, list):
+                valid_results.extend(r)
+            else:
+                valid_results.append(r)
+
+        print(f"Completed: {len(valid_results)} valid results")
+
+    except Exception as e:
+        print("EVALUATION STOPPED - API Error occurred:")
+        print(f"Error type: {type(e).__name__}")
+        print(f"Error message: {str(e)}")
+        return pd.DataFrame()
 
+    # Save results (skip in test mode)
     if valid_results:
+        results_df = pd.DataFrame(valid_results)
+
+        # Aggregate results
+        results_df = (
+            results_df.groupby(["model", "bcp_47", "task", "metric", "origin"])
+            .agg({"score": "mean"})
+            .reset_index()
+        )
 
-        # Merge with old results
-        old_results = pd.read_json("results.json")
-        results_df = pd.concat([old_results, results_df])
-        results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
-        results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
-        results_df.to_json("results.json", **args)
-    else:
-        print("⚠️ No valid results to save - all API calls failed")
-
-    # Save up-to-date info on models and languages (like main branch)
-    all_models = pd.concat([pd.DataFrame(models), old_models])
-    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
-    all_models.to_json("models.json", **args)
-    pd.DataFrame(languages).to_json("languages.json", **args)
-
-    # Time estimation
-    elapsed = time.time() - start_time
-    elapsed_str = str(timedelta(seconds=int(elapsed)))
-    if n_languages < max_languages:
-        remaining_batches = (max_languages - n_languages) // 10
-        batch_count = max(1, n_languages // 10)  # Avoid division by zero
-        estimated_remaining = elapsed * remaining_batches / batch_count
-        eta = datetime.now() + timedelta(seconds=estimated_remaining)
-        print(f"⏱️ Batch completed in {elapsed_str}. ETA for full run: {eta.strftime('%H:%M:%S')}")
-    else:
-        print(f"✅ Full evaluation completed in {elapsed_str}")
-    print(f"🎉 Finished at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
-
-    # Clean up checkpoint file on successful completion
-    if os.path.exists("checkpoint.json"):
-        os.remove("checkpoint.json")
-        print("🧹 Cleaned up checkpoint file")
+        if not test_mode:
+            args = dict(orient="records", indent=2, force_ascii=False)
+
+            # Merge with existing results
+            if not old_results.empty:
+                results_df = pd.concat([old_results, results_df])
+                results_df = results_df.drop_duplicates(subset=["model", "bcp_47", "task", "metric", "origin"])
+
+            results_df = results_df.sort_values(by=["model", "bcp_47", "task", "metric"])
+            results_df.to_json("results.json", **args)
+
+            # Save model and language info
+            models_df.to_json("models.json", **args)
+            languages_df.to_json("languages.json", **args)
+        else:
+            print("TEST MODE: Skipping results saving")
+
+        elapsed = time.time() - start_time
+        print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")
+
+        return results_df
+
+    return pd.DataFrame()
 
 if __name__ == "__main__":
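The biggest behavioral change here is the switch from batched asyncio.gather(..., return_exceptions=True) with checkpointing to a single fail-fast gather. A self-contained sketch of the difference, using a toy `task` coroutine rather than the repo's evaluation tasks:

import asyncio

async def task(i: int) -> int:
    if i == 2:
        raise RuntimeError(f"task {i} failed")
    return i

async def main() -> None:
    # Old, tolerant behavior: exceptions are collected into the result list.
    results = await asyncio.gather(*[task(i) for i in range(4)], return_exceptions=True)
    print(results)  # [0, 1, RuntimeError('task 2 failed'), 3]
    # New, fail-fast behavior: the await raises on the first failed task
    # (remaining tasks are not cancelled by gather itself).
    try:
        await asyncio.gather(*[task(i) for i in range(4)], return_exceptions=False)
    except RuntimeError as e:
        print(f"stopped on first error: {e}")

asyncio.run(main())

The new configuration knobs are plain environment variables (N_SENTENCES, MAX_LANGUAGES, SINGLE_MODEL, TEST) read at the top of evaluate(), so a small smoke-test run can be configured without touching the code.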
evals/models.py
CHANGED

@@ -27,7 +27,8 @@ important_models = [
     "meta-llama/llama-3.1-70b-instruct",  # 0.3$
     "meta-llama/llama-3-70b-instruct",  # 0.4$
     # "meta-llama/llama-2-70b-chat",  # 0.9$; not properly supported by OpenRouter
-    "openai/gpt-5",
+    "openai/gpt-5",
+    "openai/gpt-5-nano",  # include if/when available
     "openai/gpt-4.1",  # 8$
     "openai/gpt-4.1-mini",  # 1.6$
     "openai/gpt-4.1-nano",  # 0.4$
@@ -96,9 +97,6 @@ def get_model(permaslug):
         and m["endpoint"]
         and not m["endpoint"]["is_free"]
     ]
-    if len(slugs) == 0:
-        # the problem is that free models typically have very high rate-limiting
-        print(f"no non-free model found for {permaslug}")
     return slugs[0] if len(slugs) >= 1 else None
 
@@ -132,18 +130,11 @@ def get_historical_popular_models(date: date):
             for model_slug, count in sorted_models[:20]:  # Top 20
                 result.append({"slug": model_slug, "count": int(count)})
 
-            print(f"✅ Historical OpenRouter models: {len(result)} models fetched")
-            if result:
-                print(f"  Top 5: {[m['slug'] for m in result[:5]]}")
-                print(f"  Sample counts: {[m['count'] for m in result[:3]]}")
             return result
         else:
-            print("⚠️ Could not find model ranking data in OpenRouter response")
            return []
 
    except Exception as e:
-        print(f"⚠️ Error fetching OpenRouter historical rankings: {e}")
-        print("🔄 Falling back to static model list")
        return []
 
@@ -176,18 +167,11 @@ def get_current_popular_models(date: date):
             for model_slug, count in sorted_models[:10]:  # Top 10
                 result.append({"slug": model_slug, "count": int(count)})
 
-            print(f"✅ Current OpenRouter models: {len(result)} models fetched")
-            if result:
-                print(f"  Top 5: {[m['slug'] for m in result[:5]]}")
-                print(f"  Sample counts: {[m['count'] for m in result[:3]]}")
            return result
        else:
-            print("⚠️ Could not find daily ranking data in OpenRouter response")
            return []
 
    except Exception as e:
-        print(f"⚠️ Error fetching OpenRouter current rankings: {e}")
-        print("🔄 Falling back to static model list")
        return []
 
@@ -244,16 +228,13 @@ async def complete(**kwargs) -> str | None:
            return None
        raise e
    except asyncio.TimeoutError:
-        print(f"⏰ Timeout after {timeout}s for model {model_id}")
        return None
    if not response.choices:
        raise Exception(response)
    return response.choices[0].message.content.strip()
 
-
 translate_client = None
 
-
 def get_google_translate_client():
    global translate_client
    if translate_client is None:
@@ -364,7 +345,7 @@ def get_cost(row):
    return None
 
 
-
+#@cache
 def load_models(date: date):
    popular_models = (
        get_historical_popular_models(date.today())[:20]
@@ -374,25 +355,12 @@ def load_models(date: date):
    all_model_candidates = set(important_models + popular_models) - set(blocklist)
 
    # Validate models exist on OpenRouter before including them
-    print(f"🔍 Validating {len(all_model_candidates)} model candidates...")
    valid_models = []
-    invalid_models = []
 
    for model_id in all_model_candidates:
        metadata = get_or_metadata(model_id)
        if metadata is not None:
            valid_models.append(model_id)
-        else:
-            invalid_models.append(model_id)
-
-    if invalid_models:
-        print(f"⚠️ Excluded {len(invalid_models)} invalid models:")
-        for model in sorted(invalid_models)[:5]:  # Show first 5
-            print(f"  - {model}")
-        if len(invalid_models) > 5:
-            print(f"  ... and {len(invalid_models) - 5} more")
-
-    print(f"✅ Using {len(valid_models)} valid models for evaluation")
 
    models = pd.DataFrame(sorted(valid_models), columns=["id"])
    or_metadata = models["id"].apply(get_or_metadata)
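With the logging stripped out, the validation step in load_models reduces to "probe each candidate's metadata and keep the ones that resolve". A compact sketch of that surviving logic, assuming get_or_metadata (the repo's OpenRouter metadata lookup) returns None for slugs that do not resolve; validate_candidates is a hypothetical name:

def validate_candidates(candidates: set[str], lookup) -> list[str]:
    """Keep only model ids whose metadata lookup succeeds, in sorted order."""
    return sorted(m for m in candidates if lookup(m) is not None)

# usage sketch: valid_models = validate_candidates(all_model_candidates, get_or_metadata)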
evals/tasks.py
CHANGED

@@ -11,10 +11,8 @@ from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
 from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
-from google.cloud import translate_v2 as translate
-from langcodes import closest_supported_match
 from languages import languages, script_name
-from models import complete, transcribe
+from models import complete, transcribe
 
 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")
@@ -45,32 +43,20 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
-    else:
-        prediction = await complete(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
-                }
-            ],
-            temperature=0,
-            max_tokens=1024,
-        )
+    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
+    prediction = await complete(
+        model=model,
+        messages=[
+            {
+                "role": "user",
+                "content": translation_prompt,
+            }
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],
@@ -83,6 +69,9 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
+
+
     return [
         {
             "model": model,
@@ -120,12 +109,16 @@ Reply with only the topic name.
 Text:
 {test_paragraph.text}
 """
+    response = await complete(
         model=model,
         messages=[{"role": "user", "content": prompt}],
         temperature=0,
         max_tokens=30,
+    )
+    pred = response.lower().strip() if response else ""
     true = test_paragraph.topic.lower().strip()
     others = [t for t in top_topics if t != true]
     acc = (
@@ -136,6 +129,8 @@ Text:
         if pred
         else 0
     )
+
+
     return [
         {
             "model": model,
@@ -228,23 +223,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
 
     return [
         {
@@ -276,23 +268,18 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice(task)}""",
         },
     ]
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,
+    )
+    if response and "####" in response:
+        answer = response.split("####")[-1].strip()
+        acc = int(answer[:1] == task["answer"])
+    else:
+        acc = 0
+        answer = "NO_ANSWER"
     return [
         {
             "model": model,
@@ -349,23 +336,20 @@ Response format: <reasoning> #### <letter>
 {format_multiple_choice_truthfulqa(task)}""",
         },
     ]
-            acc = 0
-        else:
-            raise e
+    response = await complete(
+        model=model,
+        messages=messages,
+        temperature=0,
+        max_tokens=1024,  # Increased for reasoning
+    )
+    if response and "####" in response:
+        pred_answer = response.split("####")[-1].strip()
+        acc = int(pred_answer[:1].upper() == answer)
+    else:
+        acc = 0
+        pred_answer = "NO_ANSWER"
     return [
         {
             "model": model,
@@ -407,6 +391,9 @@ Response format: <reasoning> #### <number>
         accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
     else:
         accuracy = 0
+        number = "NO_ANSWER"
+
 
     return [
         {
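All four multiple-choice and math tasks now share one parsing convention instead of per-task try/except blocks: the model must answer `<reasoning> #### <answer>`, only the text after the last `####` is graded, and a missing marker scores 0 with a NO_ANSWER sentinel. A standalone sketch of that parser; parse_hash_answer is a hypothetical helper, not code from the repo:

def parse_hash_answer(response: str | None) -> str:
    """Extract the graded answer token, or a sentinel when the marker is absent."""
    if response and "####" in response:
        return response.split("####")[-1].strip()
    return "NO_ANSWER"

assert parse_hash_answer("The area doubles, so #### B") == "B"
assert parse_hash_answer(None) == "NO_ANSWER"

Grading then compares only the first character of the extracted token (answer[:1]), which keeps the check robust to trailing punctuation or explanation after the chosen letter.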
languages.json
CHANGED
|
@@ -7,7 +7,7 @@
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
-
"commonvoice_hours":
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
@@ -32,7 +32,7 @@
|
|
| 32 |
"flores_path":"hin_Deva",
|
| 33 |
"fleurs_tag":"hi_in",
|
| 34 |
"commonvoice_hours":16.0,
|
| 35 |
-
"commonvoice_locale":"hi
|
| 36 |
"in_benchmark":true
|
| 37 |
},
|
| 38 |
{
|
|
@@ -43,7 +43,7 @@
|
|
| 43 |
"family":"Indo-European",
|
| 44 |
"flores_path":"spa_Latn",
|
| 45 |
"fleurs_tag":"es_419",
|
| 46 |
-
"commonvoice_hours":
|
| 47 |
"commonvoice_locale":"es",
|
| 48 |
"in_benchmark":true
|
| 49 |
},
|
|
@@ -79,7 +79,7 @@
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
-
"commonvoice_hours":
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
@@ -127,7 +127,7 @@
|
|
| 127 |
"family":"Indo-European",
|
| 128 |
"flores_path":"rus_Cyrl",
|
| 129 |
"fleurs_tag":"ru_ru",
|
| 130 |
-
"commonvoice_hours":
|
| 131 |
"commonvoice_locale":"ru",
|
| 132 |
"in_benchmark":true
|
| 133 |
},
|
|
@@ -139,7 +139,7 @@
|
|
| 139 |
"family":"Atlantic-Congo",
|
| 140 |
"flores_path":"swh_Latn",
|
| 141 |
"fleurs_tag":"sw_ke",
|
| 142 |
-
"commonvoice_hours":
|
| 143 |
"commonvoice_locale":"sw",
|
| 144 |
"in_benchmark":true
|
| 145 |
},
|
|
@@ -163,7 +163,7 @@
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
-
"commonvoice_hours":
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169 |
},
|
|
@@ -1027,7 +1027,7 @@
|
|
| 1027 |
"family":"Uralic",
|
| 1028 |
"flores_path":"hun_Latn",
|
| 1029 |
"fleurs_tag":"hu_hu",
|
| 1030 |
-
"commonvoice_hours":
|
| 1031 |
"commonvoice_locale":"hu",
|
| 1032 |
"in_benchmark":true
|
| 1033 |
},
|
|
@@ -1183,7 +1183,7 @@
|
|
| 1183 |
"family":"Indo-European",
|
| 1184 |
"flores_path":"bel_Cyrl",
|
| 1185 |
"fleurs_tag":"be_by",
|
| 1186 |
-
"commonvoice_hours":
|
| 1187 |
"commonvoice_locale":"be",
|
| 1188 |
"in_benchmark":true
|
| 1189 |
},
|
|
@@ -1207,7 +1207,7 @@
|
|
| 1207 |
"family":"Indo-European",
|
| 1208 |
"flores_path":"tgk_Cyrl",
|
| 1209 |
"fleurs_tag":"tg_tj",
|
| 1210 |
-
"commonvoice_hours":0.
|
| 1211 |
"commonvoice_locale":"tg",
|
| 1212 |
"in_benchmark":true
|
| 1213 |
},
|
|
@@ -1291,7 +1291,7 @@
|
|
| 1291 |
"family":"Indo-European",
|
| 1292 |
"flores_path":"cat_Latn",
|
| 1293 |
"fleurs_tag":"ca_es",
|
| 1294 |
-
"commonvoice_hours":
|
| 1295 |
"commonvoice_locale":"ca",
|
| 1296 |
"in_benchmark":true
|
| 1297 |
},
|
|
@@ -1303,7 +1303,7 @@
|
|
| 1303 |
"family":"Afro-Asiatic",
|
| 1304 |
"flores_path":"heb_Hebr",
|
| 1305 |
"fleurs_tag":"he_il",
|
| 1306 |
-
"commonvoice_hours":
|
| 1307 |
"commonvoice_locale":"he",
|
| 1308 |
"in_benchmark":true
|
| 1309 |
},
|
|
@@ -1375,7 +1375,7 @@
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
-
"commonvoice_hours":
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
@@ -1519,7 +1519,7 @@
|
|
| 1519 |
"family":"Indo-European",
|
| 1520 |
"flores_path":"kmr_Latn",
|
| 1521 |
"fleurs_tag":null,
|
| 1522 |
-
"commonvoice_hours":
|
| 1523 |
"commonvoice_locale":"kmr",
|
| 1524 |
"in_benchmark":true
|
| 1525 |
},
|
|
@@ -1555,7 +1555,7 @@
|
|
| 1555 |
"family":"Indo-European",
|
| 1556 |
"flores_path":"slk_Latn",
|
| 1557 |
"fleurs_tag":"sk_sk",
|
| 1558 |
-
"commonvoice_hours":
|
| 1559 |
"commonvoice_locale":"sk",
|
| 1560 |
"in_benchmark":true
|
| 1561 |
},
|
|
@@ -1675,7 +1675,7 @@
|
|
| 1675 |
"family":"Tupian",
|
| 1676 |
"flores_path":"gug_Latn",
|
| 1677 |
"fleurs_tag":null,
|
| 1678 |
-
"commonvoice_hours":4.
|
| 1679 |
"commonvoice_locale":"gn",
|
| 1680 |
"in_benchmark":true
|
| 1681 |
},
|
|
@@ -1747,7 +1747,7 @@
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
-
"commonvoice_hours":1.
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
@@ -2167,7 +2167,7 @@
|
|
| 2167 |
"family":"Indo-European",
|
| 2168 |
"flores_path":"glg_Latn",
|
| 2169 |
"fleurs_tag":"gl_es",
|
| 2170 |
-
"commonvoice_hours":
|
| 2171 |
"commonvoice_locale":"gl",
|
| 2172 |
"in_benchmark":true
|
| 2173 |
},
|
|
@@ -3175,8 +3175,8 @@
|
|
| 3175 |
"family":"Atlantic-Congo",
|
| 3176 |
"flores_path":null,
|
| 3177 |
"fleurs_tag":null,
|
| 3178 |
-
"commonvoice_hours":
|
| 3179 |
-
"commonvoice_locale":
|
| 3180 |
"in_benchmark":false
|
| 3181 |
},
|
| 3182 |
{
|
|
@@ -3331,7 +3331,7 @@
|
|
| 3331 |
"family":"Indo-European",
|
| 3332 |
"flores_path":"gle_Latn",
|
| 3333 |
"fleurs_tag":"ga_ie",
|
| 3334 |
-
"commonvoice_hours":9.
|
| 3335 |
"commonvoice_locale":"ga-IE",
|
| 3336 |
"in_benchmark":true
|
| 3337 |
},
|
|
@@ -3535,7 +3535,7 @@
|
|
| 3535 |
"family":null,
|
| 3536 |
"flores_path":"eus_Latn",
|
| 3537 |
"fleurs_tag":null,
|
| 3538 |
-
"commonvoice_hours":
|
| 3539 |
"commonvoice_locale":"eu",
|
| 3540 |
"in_benchmark":true
|
| 3541 |
},
|
|
@@ -3559,7 +3559,7 @@
|
|
| 3559 |
"family":"Abkhaz-Adyge",
|
| 3560 |
"flores_path":null,
|
| 3561 |
"fleurs_tag":null,
|
| 3562 |
-
"commonvoice_hours":
|
| 3563 |
"commonvoice_locale":"kbd",
|
| 3564 |
"in_benchmark":false
|
| 3565 |
},
|
|
@@ -3679,7 +3679,7 @@
|
|
| 3679 |
"family":"Indo-European",
|
| 3680 |
"flores_path":"ydd_Hebr",
|
| 3681 |
"fleurs_tag":null,
|
| 3682 |
-
"commonvoice_hours":1.
|
| 3683 |
"commonvoice_locale":"yi",
|
| 3684 |
"in_benchmark":true
|
| 3685 |
},
|
|
@@ -4099,8 +4099,8 @@
|
|
| 4099 |
"family":"Indo-European",
|
| 4100 |
"flores_path":null,
|
| 4101 |
"fleurs_tag":null,
|
| 4102 |
-
"commonvoice_hours":
|
| 4103 |
-
"commonvoice_locale":
|
| 4104 |
"in_benchmark":false
|
| 4105 |
},
|
| 4106 |
{
|
|
@@ -4651,7 +4651,7 @@
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
-
"commonvoice_hours":
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
|
@@ -5011,7 +5011,7 @@
|
|
| 5011 |
"family":"Nakh-Daghestanian",
|
| 5012 |
"flores_path":"dar_Cyrl",
|
| 5013 |
"fleurs_tag":null,
|
| 5014 |
-
"commonvoice_hours":
|
| 5015 |
"commonvoice_locale":"dar",
|
| 5016 |
"in_benchmark":true
|
| 5017 |
},
|
|
|
|
| 7 |
"family":"Indo-European",
|
| 8 |
"flores_path":"eng_Latn",
|
| 9 |
"fleurs_tag":"en_us",
|
| 10 |
+
"commonvoice_hours":2683.0,
|
| 11 |
"commonvoice_locale":"en",
|
| 12 |
"in_benchmark":true
|
| 13 |
},
|
|
|
|
| 32 |
"flores_path":"hin_Deva",
|
| 33 |
"fleurs_tag":"hi_in",
|
| 34 |
"commonvoice_hours":16.0,
|
| 35 |
+
"commonvoice_locale":"hi",
|
| 36 |
"in_benchmark":true
|
| 37 |
},
|
| 38 |
{
|
|
|
|
| 43 |
"family":"Indo-European",
|
| 44 |
"flores_path":"spa_Latn",
|
| 45 |
"fleurs_tag":"es_419",
|
| 46 |
+
"commonvoice_hours":449.0,
|
| 47 |
"commonvoice_locale":"es",
|
| 48 |
"in_benchmark":true
|
| 49 |
},
|
|
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
+
"commonvoice_hours":1072.0,
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
|
|
| 127 |
"family":"Indo-European",
|
| 128 |
"flores_path":"rus_Cyrl",
|
| 129 |
"fleurs_tag":"ru_ru",
|
| 130 |
+
"commonvoice_hours":247.0,
|
| 131 |
"commonvoice_locale":"ru",
|
| 132 |
"in_benchmark":true
|
| 133 |
},
|
|
|
|
| 139 |
"family":"Atlantic-Congo",
|
| 140 |
"flores_path":"swh_Latn",
|
| 141 |
"fleurs_tag":"sw_ke",
|
| 142 |
+
"commonvoice_hours":412.0,
|
| 143 |
"commonvoice_locale":"sw",
|
| 144 |
"in_benchmark":true
|
| 145 |
},
|
|
|
|
| 163 |
"family":"Indo-European",
|
| 164 |
"flores_path":"deu_Latn",
|
| 165 |
"fleurs_tag":"de_de",
|
| 166 |
+
"commonvoice_hours":1372.0,
|
| 167 |
"commonvoice_locale":"de",
|
| 168 |
"in_benchmark":true
|
| 169  |   },

| 1027 |   "family":"Uralic",
| 1028 |   "flores_path":"hun_Latn",
| 1029 |   "fleurs_tag":"hu_hu",
| 1030 | + "commonvoice_hours":94.0,
| 1031 |   "commonvoice_locale":"hu",
| 1032 |   "in_benchmark":true
| 1033 |   },

| 1183 |   "family":"Indo-European",
| 1184 |   "flores_path":"bel_Cyrl",
| 1185 |   "fleurs_tag":"be_by",
| 1186 | + "commonvoice_hours":1812.0,
| 1187 |   "commonvoice_locale":"be",
| 1188 |   "in_benchmark":true
| 1189 |   },

| 1207 |   "family":"Indo-European",
| 1208 |   "flores_path":"tgk_Cyrl",
| 1209 |   "fleurs_tag":"tg_tj",
| 1210 | + "commonvoice_hours":0.6,
| 1211 |   "commonvoice_locale":"tg",
| 1212 |   "in_benchmark":true
| 1213 |   },

| 1291 |   "family":"Indo-European",
| 1292 |   "flores_path":"cat_Latn",
| 1293 |   "fleurs_tag":"ca_es",
| 1294 | + "commonvoice_hours":2883.0,
| 1295 |   "commonvoice_locale":"ca",
| 1296 |   "in_benchmark":true
| 1297 |   },

| 1303 |   "family":"Afro-Asiatic",
| 1304 |   "flores_path":"heb_Hebr",
| 1305 |   "fleurs_tag":"he_il",
| 1306 | + "commonvoice_hours":2.0,
| 1307 |   "commonvoice_locale":"he",
| 1308 |   "in_benchmark":true
| 1309 |   },

| 1375 |   "family":"Turkic",
| 1376 |   "flores_path":"uig_Arab",
| 1377 |   "fleurs_tag":null,
| 1378 | + "commonvoice_hours":437.0,
| 1379 |   "commonvoice_locale":"ug",
| 1380 |   "in_benchmark":true
| 1381 |   },

| 1519 |   "family":"Indo-European",
| 1520 |   "flores_path":"kmr_Latn",
| 1521 |   "fleurs_tag":null,
| 1522 | + "commonvoice_hours":71.0,
| 1523 |   "commonvoice_locale":"kmr",
| 1524 |   "in_benchmark":true
| 1525 |   },

| 1555 |   "family":"Indo-European",
| 1556 |   "flores_path":"slk_Latn",
| 1557 |   "fleurs_tag":"sk_sk",
| 1558 | + "commonvoice_hours":52.0,
| 1559 |   "commonvoice_locale":"sk",
| 1560 |   "in_benchmark":true
| 1561 |   },

| 1675 |   "family":"Tupian",
| 1676 |   "flores_path":"gug_Latn",
| 1677 |   "fleurs_tag":null,
| 1678 | + "commonvoice_hours":4.5,
| 1679 |   "commonvoice_locale":"gn",
| 1680 |   "in_benchmark":true
| 1681 |   },

| 1747 |   "family":"Indo-European",
| 1748 |   "flores_path":"nob_Latn",
| 1749 |   "fleurs_tag":"nb_no",
| 1750 | + "commonvoice_hours":1.8,
| 1751 |   "commonvoice_locale":"nb-NO",
| 1752 |   "in_benchmark":true
| 1753 |   },

| 2167 |   "family":"Indo-European",
| 2168 |   "flores_path":"glg_Latn",
| 2169 |   "fleurs_tag":"gl_es",
| 2170 | + "commonvoice_hours":162.0,
| 2171 |   "commonvoice_locale":"gl",
| 2172 |   "in_benchmark":true
| 2173 |   },

| 3175 |   "family":"Atlantic-Congo",
| 3176 |   "flores_path":null,
| 3177 |   "fleurs_tag":null,
| 3178 | + "commonvoice_hours":0.0,
| 3179 | + "commonvoice_locale":"seh",
| 3180 |   "in_benchmark":false
| 3181 |   },
| 3182 |   {

| 3331 |   "family":"Indo-European",
| 3332 |   "flores_path":"gle_Latn",
| 3333 |   "fleurs_tag":"ga_ie",
| 3334 | + "commonvoice_hours":9.3,
| 3335 |   "commonvoice_locale":"ga-IE",
| 3336 |   "in_benchmark":true
| 3337 |   },

| 3535 |   "family":null,
| 3536 |   "flores_path":"eus_Latn",
| 3537 |   "fleurs_tag":null,
| 3538 | + "commonvoice_hours":453.0,
| 3539 |   "commonvoice_locale":"eu",
| 3540 |   "in_benchmark":true
| 3541 |   },

| 3559 |   "family":"Abkhaz-Adyge",
| 3560 |   "flores_path":null,
| 3561 |   "fleurs_tag":null,
| 3562 | + "commonvoice_hours":106.0,
| 3563 |   "commonvoice_locale":"kbd",
| 3564 |   "in_benchmark":false
| 3565 |   },

| 3679 |   "family":"Indo-European",
| 3680 |   "flores_path":"ydd_Hebr",
| 3681 |   "fleurs_tag":null,
| 3682 | + "commonvoice_hours":1.7,
| 3683 |   "commonvoice_locale":"yi",
| 3684 |   "in_benchmark":true
| 3685 |   },

| 4099 |   "family":"Indo-European",
| 4100 |   "flores_path":null,
| 4101 |   "fleurs_tag":null,
| 4102 | + "commonvoice_hours":0.0,
| 4103 | + "commonvoice_locale":"pcd",
| 4104 |   "in_benchmark":false
| 4105 |   },
| 4106 |   {

| 4651 |   "family":"Abkhaz-Adyge",
| 4652 |   "flores_path":null,
| 4653 |   "fleurs_tag":null,
| 4654 | + "commonvoice_hours":32.0,
| 4655 |   "commonvoice_locale":"ady",
| 4656 |   "in_benchmark":false
| 4657 |   },

| 5011 |   "family":"Nakh-Daghestanian",
| 5012 |   "flores_path":"dar_Cyrl",
| 5013 |   "fleurs_tag":null,
| 5014 | + "commonvoice_hours":1.3,
| 5015 |   "commonvoice_locale":"dar",
| 5016 |   "in_benchmark":true
| 5017 |   },
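All of the languages.json hunks in this commit touch a single field: the commonvoice_hours figures (hours of recorded Common Voice speech per locale) were refreshed, and the two locales new to the file (seh, pcd) also had commonvoice_locale set. A minimal sketch for inspecting the refreshed coverage, assuming only the fields visible in this diff and that commonvoice_hours may be null for some records:

import json

# Sketch: flag benchmark languages with little Common Voice speech data.
# Only fields visible in the diff above are assumed here.
with open("languages.json") as f:
    languages = json.load(f)

low_coverage = [
    lang["commonvoice_locale"]
    for lang in languages
    if lang.get("in_benchmark") and (lang.get("commonvoice_hours") or 0) < 10
]
print(len(low_coverage), "benchmark languages under 10 recorded hours:")
print(", ".join(low_coverage))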
models.json
CHANGED
@@ -1,15 +1,15 @@
  [
  {
-   "id":
-   "name":
-   "provider_name":
-   "cost":
-   "hf_id":
-   "size":
-   "type":
-   "license":
-   "creation_date":
-   "tasks":
+   "id":"openai\/gpt-5-nano",
+   "name":"GPT-5 Nano",
+   "provider_name":"OpenAI",
+   "cost":0.4,
+   "hf_id":null,
+   "size":null,
+   "type":"closed-source",
+   "license":null,
+   "creation_date":1754524800000,
+   "tasks":[
      "translation_from",
      "translation_to",
      "classification",
@@ -18,971 +18,5 @@
      "truthfulqa",
      "mgsm"
    ]
- },
- {
-   "id": "anthracite-org/magnum-v4-72b",
-   "name": "Magnum v4 72B",
-   "provider_name": "Magnum v4 72B",
-   "cost": 3.0,
-   "hf_id": "anthracite-org/magnum-v4-72b",
-   "size": 72706203648.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1726790400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "anthropic/claude-sonnet-4",
-   "name": "Claude Sonnet 4",
-   "provider_name": "Anthropic",
-   "cost": 15.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1747872000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "deepseek/deepseek-chat",
-   "name": "DeepSeek V3",
-   "provider_name": "DeepSeek",
-   "cost": 0.72,
-   "hf_id": "deepseek-ai/DeepSeek-V3",
-   "size": 684531386000.0,
-   "type": "open-source",
-   "license": "",
-   "creation_date": 1735084800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "deepseek/deepseek-chat-v3-0324",
-   "name": "DeepSeek V3 0324",
-   "provider_name": "DeepSeek",
-   "cost": 0.0,
-   "hf_id": "deepseek-ai/DeepSeek-V3-0324",
-   "size": 684531386000.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1742774400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "deepseek/deepseek-r1-0528",
-   "name": "R1 0528",
-   "provider_name": "DeepSeek",
-   "cost": 0.0,
-   "hf_id": "deepseek-ai/DeepSeek-R1-0528",
-   "size": 684531386000.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1748390400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "google/gemini-2.0-flash-lite-001",
-   "name": "Gemini 2.0 Flash Lite",
-   "provider_name": "Google",
-   "cost": 0.3,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1740441600000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "google/gemini-2.5-flash",
-   "name": "Gemini 2.5 Flash",
-   "provider_name": "Google",
-   "cost": 2.5,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1750118400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "google/gemma-2-9b-it",
-   "name": "Gemma 2 9B",
-   "provider_name": "Google",
-   "cost": 0.0,
-   "hf_id": "google/gemma-2-9b-it",
-   "size": 9241705984.0,
-   "type": "open-source",
-   "license": "Gemma",
-   "creation_date": 1719187200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "google/gemma-3-27b-it",
-   "name": "Gemma 3 27B",
-   "provider_name": "Google",
-   "cost": 0.0,
-   "hf_id": "google/gemma-3-27b-it",
-   "size": 27432406640.0,
-   "type": "open-source",
-   "license": "Gemma",
-   "creation_date": 1740787200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "meta-llama/llama-3-70b-instruct",
-   "name": "Llama 3 70B Instruct",
-   "provider_name": "Meta",
-   "cost": 0.4,
-   "hf_id": "meta-llama/Meta-Llama-3-70B-Instruct",
-   "size": 70553706496.0,
-   "type": "open-source",
-   "license": "Llama3",
-   "creation_date": 1713312000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "meta-llama/llama-3.1-70b-instruct",
-   "name": "Llama 3.1 70B Instruct",
-   "provider_name": "Meta",
-   "cost": 0.28,
-   "hf_id": "meta-llama/Llama-3.1-70B-Instruct",
-   "size": 70553706496.0,
-   "type": "open-source",
-   "license": "Llama3.1",
-   "creation_date": 1721088000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "meta-llama/llama-3.2-3b-instruct",
-   "name": "Llama 3.2 3B Instruct",
-   "provider_name": "Meta",
-   "cost": 0.0,
-   "hf_id": "meta-llama/Llama-3.2-3B-Instruct",
-   "size": 3212749824.0,
-   "type": "open-source",
-   "license": "Llama3.2",
-   "creation_date": 1726617600000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "meta-llama/llama-3.3-70b-instruct",
-   "name": "Llama 3.3 70B Instruct",
-   "provider_name": "Meta",
-   "cost": 0.0,
-   "hf_id": "meta-llama/Llama-3.3-70B-Instruct",
-   "size": 70553706496.0,
-   "type": "open-source",
-   "license": "Llama3.3",
-   "creation_date": 1732579200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "meta-llama/llama-4-maverick",
-   "name": "Llama 4 Maverick",
-   "provider_name": "Meta",
-   "cost": 0.6,
-   "hf_id": "meta-llama/Llama-4-Maverick-17B-128E-Instruct",
-   "size": 401583781376.0,
-   "type": "open-source",
-   "license": "Other",
-   "creation_date": 1743465600000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "meta-llama/llama-guard-4-12b",
-   "name": "Llama Guard 4 12B",
-   "provider_name": "Meta",
-   "cost": 0.18,
-   "hf_id": "meta-llama/Llama-Guard-4-12B",
-   "size": 12001097216.0,
-   "type": "open-source",
-   "license": "Other",
-   "creation_date": 1745366400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "microsoft/phi-3-medium-128k-instruct",
-   "name": "Phi-3 Medium 128K Instruct",
-   "provider_name": "Microsoft",
-   "cost": 1.0,
-   "hf_id": "microsoft/Phi-3-medium-128k-instruct",
-   "size": 13960238080.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1715040000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "microsoft/phi-3.5-mini-128k-instruct",
-   "name": "Phi-3.5 Mini 128K Instruct",
-   "provider_name": "Microsoft",
-   "cost": 0.1,
-   "hf_id": "microsoft/Phi-3.5-mini-instruct",
-   "size": 3821079552.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1723766400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "microsoft/phi-4",
-   "name": "Phi 4",
-   "provider_name": "Microsoft",
-   "cost": 0.14,
-   "hf_id": "microsoft/phi-4",
-   "size": 14659507200.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1733875200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "microsoft/phi-4-multimodal-instruct",
-   "name": "Phi 4 Multimodal Instruct",
-   "provider_name": "Microsoft",
-   "cost": 0.1,
-   "hf_id": "microsoft/Phi-4-multimodal-instruct",
-   "size": 5574460384.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1740355200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "mistralai/magistral-medium-2506",
-   "name": "Magistral Medium 2506",
-   "provider_name": "Mistral",
-   "cost": 5.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1749340800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "mistralai/mistral-7b-instruct",
-   "name": "Mistral 7B Instruct",
-   "provider_name": "Mistral",
-   "cost": 0.0,
-   "hf_id": "mistralai/Mistral-7B-Instruct-v0.3",
-   "size": 7248023552.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1716336000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "mistralai/mistral-nemo",
-   "name": "Mistral Nemo",
-   "provider_name": "Mistral",
-   "cost": 0.0,
-   "hf_id": "mistralai/Mistral-Nemo-Instruct-2407",
-   "size": 12247782400.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1721174400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "mistralai/mistral-saba",
-   "name": "Saba",
-   "provider_name": "Mistral",
-   "cost": 0.6,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1739750400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "mistralai/mistral-small-3.1-24b-instruct",
-   "name": "Mistral Small 3.1 24B",
-   "provider_name": "Mistral",
-   "cost": 0.0,
-   "hf_id": "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
-   "size": 24011361280.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1741651200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "mistralai/mixtral-8x7b-instruct",
-   "name": "Mixtral 8x7B Instruct",
-   "provider_name": "Mistral",
-   "cost": 0.24,
-   "hf_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-   "size": 46702792704.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1702166400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "neversleep/llama-3-lumimaid-70b",
-   "name": "Llama 3 Lumimaid 70B",
-   "provider_name": "NeverSleep",
-   "cost": 6.0,
-   "hf_id": "NeverSleep/Llama-3-Lumimaid-70B-v0.1",
-   "size": 70553706496.0,
-   "type": "open-source",
-   "license": "Cc By Nc 4.0",
-   "creation_date": 1714262400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "nvidia/llama-3.1-nemotron-70b-instruct",
-   "name": "Llama 3.1 Nemotron 70B Instruct",
-   "provider_name": "NVIDIA",
-   "cost": 0.3,
-   "hf_id": "nvidia/Llama-3.1-Nemotron-70B-Instruct-HF",
-   "size": 70553706496.0,
-   "type": "open-source",
-   "license": "Llama3.1",
-   "creation_date": 1728691200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/chatgpt-4o-latest",
-   "name": "ChatGPT-4o",
-   "provider_name": "OpenAI",
-   "cost": 15.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1723593600000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-3.5-turbo",
-   "name": "GPT-3.5 Turbo",
-   "provider_name": "OpenAI",
-   "cost": 1.5,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1685232000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-3.5-turbo-0613",
-   "name": "GPT-3.5 Turbo (older v0613)",
-   "provider_name": "OpenAI",
-   "cost": 2.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1706140800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-4.1",
-   "name": "GPT-4.1",
-   "provider_name": "OpenAI",
-   "cost": 8.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1744588800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-4.1-mini",
-   "name": "GPT-4.1 Mini",
-   "provider_name": "OpenAI",
-   "cost": 1.6,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1744588800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-4.1-nano",
-   "name": "GPT-4.1 Nano",
-   "provider_name": "OpenAI",
-   "cost": 0.4,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1744588800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-4o-2024-11-20",
-   "name": "GPT-4o (2024-11-20)",
-   "provider_name": "OpenAI",
-   "cost": 10.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1732060800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-4o-mini",
-   "name": "GPT-4o-mini",
-   "provider_name": "OpenAI",
-   "cost": 0.6,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1721260800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "openai/gpt-5",
-   "name": "GPT-5",
-   "provider_name": "OpenAI",
-   "cost": 10.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1754524800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "opengvlab/internvl3-14b",
-   "name": "InternVL3 14B",
-   "provider_name": "OpenGVLab",
-   "cost": 0.4,
-   "hf_id": "OpenGVLab/InternVL3-14B",
-   "size": 15117256704.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1744243200000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "qwen/qwen3-235b-a22b",
-   "name": "Qwen3 235B A22B",
-   "provider_name": "Qwen",
-   "cost": 0.0,
-   "hf_id": "Qwen/Qwen3-235B-A22B",
-   "size": 235093634560.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1745712000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "qwen/qwen3-30b-a3b",
-   "name": "Qwen3 30B A3B",
-   "provider_name": "Qwen",
-   "cost": 0.0,
-   "hf_id": "Qwen/Qwen3-30B-A3B",
-   "size": 30532122624.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1745712000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "qwen/qwen3-32b",
-   "name": "Qwen3 32B",
-   "provider_name": "Qwen",
-   "cost": 0.07,
-   "hf_id": "Qwen/Qwen3-32B",
-   "size": 32762123264.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1745712000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "qwen/qwq-32b",
-   "name": "QwQ 32B",
-   "provider_name": "Qwen",
-   "cost": 0.0,
-   "hf_id": "Qwen/QwQ-32B",
-   "size": 32763876352.0,
-   "type": "open-source",
-   "license": "Apache 2.0",
-   "creation_date": 1741132800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "switchpoint/router",
-   "name": "Switchpoint Router",
-   "provider_name": "Switchpoint Router",
-   "cost": 3.4,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1752192000000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "thedrummer/anubis-pro-105b-v1",
-   "name": "Anubis Pro 105B V1",
-   "provider_name": "TheDrummer",
-   "cost": 1.0,
-   "hf_id": "TheDrummer/Anubis-Pro-105B-v1",
-   "size": 104779882496.0,
-   "type": "open-source",
-   "license": "Other",
-   "creation_date": 1738454400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "thedrummer/skyfall-36b-v2",
-   "name": "Skyfall 36B V2",
-   "provider_name": "TheDrummer",
-   "cost": 0.19,
-   "hf_id": "TheDrummer/Skyfall-36B-v2",
-   "size": 36910535680.0,
-   "type": "open-source",
-   "license": "Other",
-   "creation_date": 1738540800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "tngtech/deepseek-r1t-chimera",
-   "name": "DeepSeek R1T Chimera",
-   "provider_name": "TNG",
-   "cost": 0.0,
-   "hf_id": "tngtech/DeepSeek-R1T-Chimera",
-   "size": 684531386000.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1745625600000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "tngtech/deepseek-r1t2-chimera",
-   "name": "DeepSeek R1T2 Chimera",
-   "provider_name": "TNG",
-   "cost": 0.0,
-   "hf_id": "tngtech/DeepSeek-TNG-R1T2-Chimera",
-   "size": 684531386000.0,
-   "type": "open-source",
-   "license": "Mit",
-   "creation_date": 1751414400000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "x-ai/grok-2-1212",
-   "name": "Grok 2 1212",
-   "provider_name": "xAI",
-   "cost": 10.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": 1734220800000,
-   "tasks": [
-     "translation_from",
-     "translation_to",
-     "classification",
-     "mmlu",
-     "arc",
-     "truthfulqa",
-     "mgsm"
-   ]
- },
- {
-   "id": "google/translate-v2",
-   "name": "Google Translate",
-   "provider_name": "Google",
-   "cost": 20.0,
-   "hf_id": null,
-   "size": null,
-   "type": "closed-source",
-   "license": null,
-   "creation_date": null,
-   "tasks": [
-     "translation_from",
-     "translation_to"
-   ]
- },
- {
-   "id": "moonshotai/kimi-k2",
-   "name": "Kimi K2",
-   "provider_name": "Moonshot AI",
-   "size": null,
-   "type": "closed-source",
-   "cost": 0.6,
-   "hf_id": null,
-   "creation_date": null,
-   "license": null
  }
  ]
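With this commit, models.json drops from roughly fifty tracked models to the single GPT-5 Nano record shown in the first hunk; presumably the cleaned-up eval scripts repopulate the file with current model metadata on each nightly run. A minimal sketch for reading the file back, assuming only the field names visible in the diff (cost is left unit-less here, since the diff does not state its unit):

import json

# Sketch: list the models the eval pipeline currently knows about.
with open("models.json") as f:
    models = json.load(f)

for m in models:
    tasks = ", ".join(m.get("tasks", []))
    print(f"{m['name']} ({m['id']}): {m['type']}, cost={m['cost']}, tasks=[{tasks}]")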
results.json
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
-size
+oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4
+size 8030558
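results.json is tracked with Git LFS, so the diff shows only the pointer file (spec version, SHA-256 object id, and byte size) rather than the evaluation results themselves; the new blob weighs in at 8030558 bytes, about 8 MB. A minimal sketch of parsing such a pointer:

# Sketch: parse a Git LFS pointer file like the one shown above.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")
        fields[key] = value
    return fields

pointer = (
    "version https://git-lfs.github.com/spec/v1\n"
    "oid sha256:afcbf2e565f584c3e57fbdbd788e12aaa887f421e04249ab35a8a9fcf94ad6b4\n"
    "size 8030558\n"
)
info = parse_lfs_pointer(pointer)
print(info["oid"], int(info["size"]), "bytes")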
|