Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
feat: add is_anonymous field
Browse files
- app.py +2 -2
- src/display/utils.py +5 -0
- src/read_evals.py +6 -2
- src/utils.py +3 -3
app.py
CHANGED
|
@@ -315,7 +315,7 @@ with demo:
|
|
| 315 |
with gr.Row():
|
| 316 |
file_output = gr.File()
|
| 317 |
with gr.Row():
|
| 318 |
-
|
| 319 |
label="Nope. I want to submit anonymously 🥷",
|
| 320 |
value=False,
|
| 321 |
info="Do you want to be shown on the leaderboard by default?")
|
|
@@ -336,7 +336,7 @@ with demo:
|
|
| 336 |
model_name,
|
| 337 |
model_url,
|
| 338 |
benchmark_version,
|
| 339 |
-
|
| 340 |
],
|
| 341 |
submission_result,
|
| 342 |
show_progress="hidden"
|
|
|
|
| 315 |
with gr.Row():
|
| 316 |
file_output = gr.File()
|
| 317 |
with gr.Row():
|
| 318 |
+
is_anonymous = gr.Checkbox(
|
| 319 |
label="Nope. I want to submit anonymously 🥷",
|
| 320 |
value=False,
|
| 321 |
info="Do you want to be shown on the leaderboard by default?")
|
|
|
|
| 336 |
model_name,
|
| 337 |
model_url,
|
| 338 |
benchmark_version,
|
| 339 |
+
is_anonymous
|
| 340 |
],
|
| 341 |
submission_result,
|
| 342 |
show_progress="hidden"
|
src/display/utils.py
CHANGED
|
@@ -27,6 +27,7 @@ COL_NAME_RERANKING_MODEL_LINK = "Reranking Model LINK"
|
|
| 27 |
COL_NAME_RANK = "Rank 🏆"
|
| 28 |
COL_NAME_REVISION = "Revision"
|
| 29 |
COL_NAME_TIMESTAMP = "Submission Date"
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
def get_default_auto_eval_column_dict():
|
|
@@ -56,8 +57,12 @@ def get_default_auto_eval_column_dict():
|
|
| 56 |
auto_eval_column_dict.append(
|
| 57 |
["reranking_model_link", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", False, hidden=True, never_hidden=False)]
|
| 58 |
)
|
|
|
|
|
|
|
|
|
|
| 59 |
return auto_eval_column_dict
|
| 60 |
|
|
|
|
| 61 |
def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
|
| 62 |
auto_eval_column_dict = get_default_auto_eval_column_dict()
|
| 63 |
## Leaderboard columns
|
|
|
|
| 27 |
COL_NAME_RANK = "Rank 🏆"
|
| 28 |
COL_NAME_REVISION = "Revision"
|
| 29 |
COL_NAME_TIMESTAMP = "Submission Date"
|
| 30 |
+
COL_NAME_IS_ANONYMOUS = "Anonymous Submission"
|
| 31 |
|
| 32 |
|
| 33 |
def get_default_auto_eval_column_dict():
|
|
|
|
| 57 |
auto_eval_column_dict.append(
|
| 58 |
["reranking_model_link", ColumnContent, ColumnContent(COL_NAME_RERANKING_MODEL, "markdown", False, hidden=True, never_hidden=False)]
|
| 59 |
)
|
| 60 |
+
auto_eval_column_dict.append(
|
| 61 |
+
["is_anonymous", ColumnContent, ColumnContent(COL_NAME_IS_ANONYMOUS, "bool", False, hidden=True)]
|
| 62 |
+
)
|
| 63 |
return auto_eval_column_dict
|
| 64 |
|
| 65 |
+
|
| 66 |
def make_autoevalcolumn(cls_name="BenchmarksQA", benchmarks=BenchmarksQA):
|
| 67 |
auto_eval_column_dict = get_default_auto_eval_column_dict()
|
| 68 |
## Leaderboard columns
|
src/read_evals.py
CHANGED
|
@@ -40,6 +40,7 @@ class EvalResult:
|
|
| 40 |
metric: str
|
| 41 |
timestamp: str = "" # submission timestamp
|
| 42 |
revision: str = ""
|
|
|
|
| 43 |
|
| 44 |
|
| 45 |
@dataclass
|
|
@@ -55,6 +56,7 @@ class FullEvalResult:
|
|
| 55 |
results: List[EvalResult] # results on all the EvalResults over different tasks and metrics.
|
| 56 |
timestamp: str = ""
|
| 57 |
revision: str = ""
|
|
|
|
| 58 |
|
| 59 |
@classmethod
|
| 60 |
def init_from_json_file(cls, json_filepath):
|
|
@@ -87,7 +89,8 @@ class FullEvalResult:
|
|
| 87 |
task=config["task"],
|
| 88 |
metric=config["metric"],
|
| 89 |
timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
|
| 90 |
-
revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e")
|
|
|
|
| 91 |
)
|
| 92 |
result_list.append(eval_result)
|
| 93 |
return cls(
|
|
@@ -98,7 +101,8 @@ class FullEvalResult:
|
|
| 98 |
reranking_model_link=reranking_model_link,
|
| 99 |
results=result_list,
|
| 100 |
timestamp=result_list[0].timestamp,
|
| 101 |
-
revision=result_list[0].revision
|
|
|
|
| 102 |
)
|
| 103 |
|
| 104 |
def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
|
|
|
|
| 40 |
metric: str
|
| 41 |
timestamp: str = "" # submission timestamp
|
| 42 |
revision: str = ""
|
| 43 |
+
is_anonymous: bool = False
|
| 44 |
|
| 45 |
|
| 46 |
@dataclass
|
|
|
|
| 56 |
results: List[EvalResult] # results on all the EvalResults over different tasks and metrics.
|
| 57 |
timestamp: str = ""
|
| 58 |
revision: str = ""
|
| 59 |
+
is_anonymous: bool = False
|
| 60 |
|
| 61 |
@classmethod
|
| 62 |
def init_from_json_file(cls, json_filepath):
|
|
|
|
| 89 |
task=config["task"],
|
| 90 |
metric=config["metric"],
|
| 91 |
timestamp=config.get("timestamp", "2024-05-12T12:24:02Z"),
|
| 92 |
+
revision=config.get("revision", "3a2ba9dcad796a48a02ca1147557724e"),
|
| 93 |
+
is_anonymous=config.get("is_anonymous", False)
|
| 94 |
)
|
| 95 |
result_list.append(eval_result)
|
| 96 |
return cls(
|
|
|
|
| 101 |
reranking_model_link=reranking_model_link,
|
| 102 |
results=result_list,
|
| 103 |
timestamp=result_list[0].timestamp,
|
| 104 |
+
revision=result_list[0].revision,
|
| 105 |
+
is_anonymous=result_list[0].is_anonymous
|
| 106 |
)
|
| 107 |
|
| 108 |
def to_dict(self, task='qa', metric='ndcg_at_3') -> List:
|
src/utils.py
CHANGED
|
@@ -59,7 +59,7 @@ def get_default_cols(task: str, columns: list = [], add_fix_cols: bool = True) -
|
|
| 59 |
for col_name, col_type in zip(cols_list, types_list):
|
| 60 |
if col_name not in benchmark_list:
|
| 61 |
continue
|
| 62 |
-
if columns and col_name not in columns:
|
| 63 |
continue
|
| 64 |
cols.append(col_name)
|
| 65 |
types.append(col_type)
|
|
@@ -178,7 +178,7 @@ def get_iso_format_timestamp():
|
|
| 178 |
return iso_format_timestamp, filename_friendly_timestamp
|
| 179 |
|
| 180 |
|
| 181 |
-
def submit_results(filepath: str, model: str, model_url: str, version: str = "AIR-Bench_24.04"):
|
| 182 |
if not filepath.endswith(".zip"):
|
| 183 |
return styled_error(f"file uploading aborted. wrong file type: {filepath}")
|
| 184 |
|
|
@@ -218,7 +218,7 @@ def submit_results(filepath: str, model: str, model_url: str, version: str = "AI
|
|
| 218 |
"model_name": f"{model}",
|
| 219 |
"model_url": f"{model_url}",
|
| 220 |
"version": f"{version}",
|
| 221 |
-
|
| 222 |
"revision": f"{revision}",
|
| 223 |
"timestamp": f"{timestamp_config}"
|
| 224 |
}
|
|
|
|
| 59 |
for col_name, col_type in zip(cols_list, types_list):
|
| 60 |
if col_name not in benchmark_list:
|
| 61 |
continue
|
| 62 |
+
if len(columns) > 0 and col_name not in columns:
|
| 63 |
continue
|
| 64 |
cols.append(col_name)
|
| 65 |
types.append(col_type)
|
|
|
|
| 178 |
return iso_format_timestamp, filename_friendly_timestamp
|
| 179 |
|
| 180 |
|
| 181 |
+
def submit_results(filepath: str, model: str, model_url: str, version: str = "AIR-Bench_24.04", is_anonymous=False):
|
| 182 |
if not filepath.endswith(".zip"):
|
| 183 |
return styled_error(f"file uploading aborted. wrong file type: {filepath}")
|
| 184 |
|
|
|
|
| 218 |
"model_name": f"{model}",
|
| 219 |
"model_url": f"{model_url}",
|
| 220 |
"version": f"{version}",
|
| 221 |
+
"is_anonymous": f"{is_anonymous}",
|
| 222 |
"revision": f"{revision}",
|
| 223 |
"timestamp": f"{timestamp_config}"
|
| 224 |
}
|