geoalgo committed on
Commit
653989f
·
1 Parent(s): ca25c6f

update leaderboard

Browse files
Files changed (2) hide show
  1. main.py +40 -12
  2. results_instruction_tuning.csv.zip +2 -2
main.py CHANGED
@@ -13,21 +13,47 @@ df_core.drop("#Tokens", axis=1, inplace=True)
13
  df_core.drop("AVG", axis=1, inplace=True)
14
  benchmarks_core = df_core.columns[1:]
15
  df_core["Average ⬆️"] = df_core.loc[:, benchmarks_core].mean(axis=1)
16
-
17
 
18
  df_instruction_tuning = pd.read_csv("results_instruction_tuning.csv.zip")
19
- df_instruction_tuning = df_instruction_tuning.pivot_table(
20
  index="model_B", columns="benchmark", values="preference"
21
  )
22
- df_instruction_tuning.index.rename("Model", inplace=True)
23
- df_instruction_tuning.reset_index(drop=False, inplace=True)
24
- df_instruction_tuning.columns = [x.capitalize() for x in df_instruction_tuning.columns]
 
 
25
  # first column is model
26
- df_instruction_tuning["Average"] = df_instruction_tuning.loc[
27
- :, df_instruction_tuning.columns[1:]
28
  ].mean(axis=1)
29
  # df_instruction_tuning.drop("benchmark", axis=1, inplace=True)
 
 
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
  with gr.Blocks() as demo:
33
  gr.Markdown(
@@ -59,10 +85,12 @@ with gr.Blocks() as demo:
59
  """
60
  )
61
  Leaderboard(
62
- value=df_instruction_tuning.round(2),
63
  select_columns=SelectColumns(
64
  default_selection=[
65
- col for col in df_instruction_tuning.columns if not "-eu" in col
 
 
66
  ],
67
  cant_deselect=["Model"],
68
  label="Select Columns to Display:",
@@ -77,13 +105,13 @@ with gr.Blocks() as demo:
77
  with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
78
  gr.Markdown(
79
  """
80
- Winrate against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
81
  """
82
  )
83
  Leaderboard(
84
- value=df_instruction_tuning.round(2),
85
  select_columns=SelectColumns(
86
- default_selection=list(df_instruction_tuning.columns),
87
  cant_deselect=["Model"],
88
  label="Select Columns to Display:",
89
  ),
 
13
  df_core.drop("AVG", axis=1, inplace=True)
14
  benchmarks_core = df_core.columns[1:]
15
  df_core["Average ⬆️"] = df_core.loc[:, benchmarks_core].mean(axis=1)
16
+ df_core.sort_values(by="Average ⬆️", ascending=False, inplace=True)
17
 
18
  df_instruction_tuning = pd.read_csv("results_instruction_tuning.csv.zip")
19
+ df_instruction_tuning_pivot = df_instruction_tuning.pivot_table(
20
  index="model_B", columns="benchmark", values="preference"
21
  )
22
+ df_instruction_tuning_pivot.index.rename("Model", inplace=True)
23
+ df_instruction_tuning_pivot.reset_index(drop=False, inplace=True)
24
+ df_instruction_tuning_pivot.columns = [
25
+ x.capitalize() for x in df_instruction_tuning_pivot.columns
26
+ ]
27
  # first column is model
28
+ df_instruction_tuning_pivot["Average ⬆️"] = df_instruction_tuning_pivot.loc[
29
+ :, df_instruction_tuning_pivot.columns[1:]
30
  ].mean(axis=1)
31
  # df_instruction_tuning.drop("benchmark", axis=1, inplace=True)
32
+ df_instruction_tuning_pivot.sort_values(by="Average ⬆️", ascending=False, inplace=True)
33
+
34
 
35
+ df_mah_pivot = df_instruction_tuning[
36
+ df_instruction_tuning.benchmark == "m-arena-hard-EU"
37
+ ].copy()
38
+ df_mah_pivot["lang"] = df_instruction_tuning.instruction_index.apply(
39
+ lambda s: s.split("-")[-1]
40
+ )
41
+
42
+ df_mah_pivot = df_mah_pivot.pivot_table(
43
+ index="model_B", columns="lang", values="preference"
44
+ )
45
+ df_mah_pivot["Average ⬆️"] = df_mah_pivot.mean(axis=1)
46
+ df_mah_pivot.sort_values(by="Average ⬆️", ascending=False, inplace=True)
47
+ df_mah_pivot.index.rename("Model", inplace=True)
48
+ df_mah_pivot.reset_index(drop=False, inplace=True)
49
+
50
+ cols = [
51
+ #'Llama-3.1-8B',
52
+ "Llama-3.1-Tulu-3-8B-SFT",
53
+ "Llama-3.2-3B-Instruct",
54
+ "Llama-3.1-Tulu-3-8B-DPO",
55
+ "Apertus-8B-Instruct-2509",
56
+ ]
57
 
58
  with gr.Blocks() as demo:
59
  gr.Markdown(
 
85
  """
86
  )
87
  Leaderboard(
88
+ value=df_instruction_tuning_pivot.round(2),
89
  select_columns=SelectColumns(
90
  default_selection=[
91
+ col
92
+ for col in df_instruction_tuning_pivot.columns
93
+ if not "-eu" in col
94
  ],
95
  cant_deselect=["Model"],
96
  label="Select Columns to Display:",
 
105
  with gr.Tab("Instruction-tuning multi-lingual 🎯🇪🇺"):
106
  gr.Markdown(
107
  """
108
+ Winrate on m-Arena-Hard instructions against Llama-3.1-8B-Instruct using Llama-3.1-70B-Instruct as the LLM-judge.
109
  """
110
  )
111
  Leaderboard(
112
+ value=df_mah_pivot.round(2),
113
  select_columns=SelectColumns(
114
+ default_selection=list(df_mah_pivot.columns),
115
  cant_deselect=["Model"],
116
  label="Select Columns to Display:",
117
  ),
results_instruction_tuning.csv.zip CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f76e51ad41bb386359abb58e10ea274cdd5189dfd468f5bb58850c61fb8c16f0
3
- size 209306
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:484a54e6946b58cdb8a76dd0bd0f48618905d8dd139b60de52f744c27eaf170d
3
+ size 258876