Upload from GitHub Actions: Try moving `cache` calls that cause CI issues
- evals/models.py +5 -4
- evals/tasks.py +6 -11
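Two files are touched. In evals/models.py, the block that merges the top five historically popular and the top five currently popular models into the configured `models` list is reshaped. In evals/tasks.py, the module-level `@cache` decorators are dropped from the evaluation coroutines and caching is applied instead where the `tasks` registry is assembled, which is the moving of `cache` calls that the commit title refers to.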
evals/models.py
CHANGED

@@ -93,9 +93,10 @@ def get_current_popular_models(date: date):
     return [get_model(model["model_permaslug"]) for model in data]


-popular_models =
-    date.today()
-
+popular_models = (
+    get_historical_popular_models(date.today())[:5]
+    + get_current_popular_models(date.today())[:5]
+)
 popular_models = [get_model(m) for m in popular_models if get_model(m)]
 popular_models = [
     m for m in popular_models if m["endpoint"] and not m["endpoint"]["is_free"]
@@ -104,7 +105,7 @@ popular_models = [m["slug"] for m in popular_models]
 popular_models = [
     m for m in popular_models if m and m not in models and m not in blocklist
 ]
-models += popular_models
+models += popular_models

 load_dotenv()
 client = AsyncOpenAI(
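Read top to bottom, the resulting pipeline in evals/models.py combines the two popularity sources, resolves each slug, drops free endpoints, removes anything already configured or blocklisted, and only then extends `models`. Below is a toy, self-contained sketch of that filter chain; `get_historical_popular_models`, `get_current_popular_models`, `get_model`, `models`, and `blocklist` are stand-ins for the real helpers and data defined elsewhere in evals/models.py, and the slugs are purely illustrative.

from datetime import date

# Hypothetical stand-ins; the real helpers and data live in evals/models.py.
models = ["openai/gpt-4o-mini"]
blocklist = ["example/blocked-model"]


def get_historical_popular_models(day):
    return ["meta-llama/llama-3-70b", "example/blocked-model", "openai/gpt-4o-mini"]


def get_current_popular_models(day):
    return ["mistralai/mistral-large", "example/free-model"]


def get_model(slug):
    # Pretend registry lookup; one slug is flagged as a free endpoint so the
    # filter below has something to drop.
    return {"slug": slug, "endpoint": {"is_free": slug == "example/free-model"}}


popular_models = (
    get_historical_popular_models(date.today())[:5]
    + get_current_popular_models(date.today())[:5]
)
popular_models = [get_model(m) for m in popular_models if get_model(m)]
popular_models = [
    m for m in popular_models if m["endpoint"] and not m["endpoint"]["is_free"]
]
popular_models = [m["slug"] for m in popular_models]
popular_models = [
    m for m in popular_models if m and m not in models and m not in blocklist
]
models += popular_models
# models is now:
# ["openai/gpt-4o-mini", "meta-llama/llama-3-70b", "mistralai/mistral-large"]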
evals/tasks.py
CHANGED

@@ -24,7 +24,6 @@ target_languages = languages[languages["in_benchmark"]].sample(
 )


-@cache
 async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     target_language = target_languages.iloc[sentence_nr]
@@ -78,7 +77,6 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
 # metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")


-@cache
 async def classify_and_evaluate(model, bcp_47, nr):
     language = languages[languages["bcp_47"] == bcp_47].iloc[0]
     sentences = flores_sentences(language)
@@ -161,7 +159,6 @@ def corrupt_sentence(sentence):
     return sentence[:start] + "<mask>" + sentence[end:]


-@cache
 async def mlm_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     sentences = flores_sentences(language)
@@ -206,7 +203,6 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
 ]


-@cache
 async def mmlu_and_evaluate(model, language_bcp_47, nr):
     ds_name, examples, task = load_mmlu(language_bcp_47, nr)
     if not task:
@@ -254,7 +250,6 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
 ]


-@cache
 async def transcribe_and_evaluate(model, language_bcp_47, nr):
     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
     fleurs = pd.read_csv(
@@ -287,10 +282,10 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):


 tasks = {
-    "translation_from": partial(translate_and_evaluate, mode="from"),
-    "translation_to": partial(translate_and_evaluate, mode="to"),
-    # "classification": classify_and_evaluate,
-    # "mlm": mlm_and_evaluate,
-    "mmlu": mmlu_and_evaluate,
-    # "asr": transcribe_and_evaluate,
+    "translation_from": cache(partial(translate_and_evaluate, mode="from")),
+    "translation_to": cache(partial(translate_and_evaluate, mode="to")),
+    # "classification": cache(classify_and_evaluate),
+    # "mlm": cache(mlm_and_evaluate),
+    "mmlu": cache(mmlu_and_evaluate),
+    # "asr": cache(transcribe_and_evaluate),
 }
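The net effect in evals/tasks.py is that the evaluation coroutines stay undecorated and caching is attached only to the entries of the `tasks` registry. One visible difference is that `cache` now wraps the `partial` (with `mode` baked in) rather than the `partial` wrapping an already-cached function, and only the tasks that are actually registered get wrapped. Below is a minimal, self-contained sketch of the new layout; the in-memory `cache` and the dummy `translate_and_evaluate` are stand-ins for the real ones, which are not fully shown in this diff.

import asyncio
from functools import partial


# In-memory stand-in for the project's `cache`; the real implementation is not
# part of this diff and may behave differently (e.g. persist results to disk).
def cache(fn):
    memo = {}

    async def wrapper(*args, **kwargs):
        key = (args, tuple(sorted(kwargs.items())))
        if key not in memo:
            memo[key] = await fn(*args, **kwargs)
        return memo[key]

    return wrapper


# Dummy coroutine standing in for the real translate_and_evaluate.
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
    return {"model": model, "bcp_47": bcp_47, "nr": sentence_nr, "mode": mode}


# New layout: cache wraps the partial, so mode is fixed before caching.
tasks = {
    "translation_from": cache(partial(translate_and_evaluate, mode="from")),
    "translation_to": cache(partial(translate_and_evaluate, mode="to")),
}


async def main():
    # Call sites look the same as with the old decorator-based layout.
    first = await tasks["translation_from"]("some-model", "en", 0)
    again = await tasks["translation_from"]("some-model", "en", 0)  # memoized
    assert first == again


asyncio.run(main())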