PyTorch
English
llama
richardmfan committed on
Commit
1720ac7
·
verified ·
1 Parent(s): fa8319a

Upload base_final

Browse files
This view is limited to 50 files because it contains too many changes. See the raw diff for the complete list.
Files changed (50) hide show
  1. .gitattributes +10 -0
  2. iter_1249000/config.json +30 -0
  3. iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-57-20.454173.json +125 -0
  4. iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl +3 -0
  5. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-18-56.328280.json +0 -0
  6. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_boolean_expressions_2025-07-17T20-18-56.328280.jsonl +0 -0
  7. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_causal_judgement_2025-07-17T20-18-56.328280.jsonl +0 -0
  8. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_date_understanding_2025-07-17T20-18-56.328280.jsonl +0 -0
  9. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_disambiguation_qa_2025-07-17T20-18-56.328280.jsonl +0 -0
  10. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_dyck_languages_2025-07-17T20-18-56.328280.jsonl +0 -0
  11. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_formal_fallacies_2025-07-17T20-18-56.328280.jsonl +0 -0
  12. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_geometric_shapes_2025-07-17T20-18-56.328280.jsonl +0 -0
  13. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_hyperbaton_2025-07-17T20-18-56.328280.jsonl +0 -0
  14. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_five_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  15. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_seven_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  16. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_three_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  17. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_movie_recommendation_2025-07-17T20-18-56.328280.jsonl +0 -0
  18. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_multistep_arithmetic_two_2025-07-17T20-18-56.328280.jsonl +0 -0
  19. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_navigate_2025-07-17T20-18-56.328280.jsonl +0 -0
  20. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_object_counting_2025-07-17T20-18-56.328280.jsonl +0 -0
  21. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_penguins_in_a_table_2025-07-17T20-18-56.328280.jsonl +0 -0
  22. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_reasoning_about_colored_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  23. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_ruin_names_2025-07-17T20-18-56.328280.jsonl +0 -0
  24. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_salient_translation_error_detection_2025-07-17T20-18-56.328280.jsonl +0 -0
  25. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_snarks_2025-07-17T20-18-56.328280.jsonl +0 -0
  26. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_sports_understanding_2025-07-17T20-18-56.328280.jsonl +0 -0
  27. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_temporal_sequences_2025-07-17T20-18-56.328280.jsonl +0 -0
  28. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_five_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  29. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_seven_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  30. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_three_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
  31. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_web_of_lies_2025-07-17T20-18-56.328280.jsonl +0 -0
  32. iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_word_sorting_2025-07-17T20-18-56.328280.jsonl +0 -0
  33. iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-44-20.876997.json +161 -0
  34. iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_2025-07-17T20-44-20.876997.jsonl +3 -0
  35. iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-07T20-33-19.023288.json +195 -0
  36. iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_cot_2025-08-07T20-33-19.023288.jsonl +0 -0
  37. iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-26T00-59-11.177986.json +130 -0
  38. iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_gsm8k_reasoning_base_2025-11-26T00-59-11.177986.jsonl +3 -0
  39. iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T23-58-29.900269.json +126 -0
  40. iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_hellaswag_2025-07-17T23-58-29.900269.jsonl +3 -0
  41. iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-26-52.816826.json +134 -0
  42. iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_2025-07-17T19-26-52.816826.jsonl +0 -0
  43. iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-22T03-51-05.521361.json +148 -0
  44. iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_64_2025-08-22T03-51-05.521361.jsonl +3 -0
  45. iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-25T01-35-23.860829.json +148 -0
  46. iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_humaneval_64_2025-11-25T01-35-23.860829.jsonl +3 -0
  47. iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-13-52.015185.json +140 -0
  48. iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_ifeval_2025-07-17T20-13-52.015185.jsonl +0 -0
  49. iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-26-10.269030.json +121 -0
  50. iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_leaderboard_gpqa_diamond_2025-07-17T20-26-10.269030.jsonl +0 -0
.gitattributes CHANGED
@@ -36,3 +36,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  250k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  figures/sft-models.png filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
36
  tokenizer.json filter=lfs diff=lfs merge=lfs -text
37
  250k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
38
  figures/sft-models.png filter=lfs diff=lfs merge=lfs -text
39
+ iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl filter=lfs diff=lfs merge=lfs -text
40
+ iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_2025-07-17T20-44-20.876997.jsonl filter=lfs diff=lfs merge=lfs -text
41
+ iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_gsm8k_reasoning_base_2025-11-26T00-59-11.177986.jsonl filter=lfs diff=lfs merge=lfs -text
42
+ iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_hellaswag_2025-07-17T23-58-29.900269.jsonl filter=lfs diff=lfs merge=lfs -text
43
+ iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_64_2025-08-22T03-51-05.521361.jsonl filter=lfs diff=lfs merge=lfs -text
44
+ iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_humaneval_64_2025-11-25T01-35-23.860829.jsonl filter=lfs diff=lfs merge=lfs -text
45
+ iter_1249000/eval_results/minerva_math500_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_minerva_math500_2025-11-28T05-06-05.219253.jsonl filter=lfs diff=lfs merge=lfs -text
46
+ iter_1249000/eval_results/mmlu_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_mmlu_professional_law_2025-07-18T00-49-44.450953.jsonl filter=lfs diff=lfs merge=lfs -text
47
+ iter_1249000/eval_results/mmlu_pro_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_mmlu_pro_law_2025-07-17T21-35-56.511842.jsonl filter=lfs diff=lfs merge=lfs -text
48
+ iter_1249000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
iter_1249000/config.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "architectures": [
3
+ "LlamaForCausalLM"
4
+ ],
5
+ "attention_bias": false,
6
+ "attention_dropout": 0.0,
7
+ "bos_token_id": 0,
8
+ "eos_token_id": 1,
9
+ "head_dim": 128,
10
+ "hidden_act": "silu",
11
+ "hidden_size": 8192,
12
+ "initializer_range": 0.02,
13
+ "intermediate_size": 28672,
14
+ "max_position_embeddings": 32768,
15
+ "mlp_bias": false,
16
+ "model_type": "llama",
17
+ "num_attention_heads": 64,
18
+ "num_hidden_layers": 80,
19
+ "num_key_value_heads": 8,
20
+ "num_local_experts": null,
21
+ "pretraining_tp": 1,
22
+ "rms_norm_eps": 1e-05,
23
+ "rope_scaling": null,
24
+ "rope_theta": 500000,
25
+ "tie_word_embeddings": false,
26
+ "torch_dtype": "float32",
27
+ "transformers_version": "4.53.0.dev0",
28
+ "use_cache": true,
29
+ "vocab_size": 250112
30
+ }
iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-57-20.454173.json ADDED
@@ -0,0 +1,125 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "arc_challenge": {
4
+ "alias": "arc_challenge",
5
+ "acc,none": 0.6083617747440273,
6
+ "acc_stderr,none": 0.014264122124938267,
7
+ "acc_norm,none": 0.6484641638225256,
8
+ "acc_norm_stderr,none": 0.013952413699601044
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "arc_challenge": []
13
+ },
14
+ "configs": {
15
+ "arc_challenge": {
16
+ "task": "arc_challenge",
17
+ "tag": [
18
+ "ai2_arc"
19
+ ],
20
+ "dataset_path": "allenai/ai2_arc",
21
+ "dataset_name": "ARC-Challenge",
22
+ "training_split": "train",
23
+ "validation_split": "validation",
24
+ "test_split": "test",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{choices.label.index(answerKey)}}",
27
+ "unsafe_code": false,
28
+ "doc_to_choice": "{{choices.text}}",
29
+ "description": "",
30
+ "target_delimiter": " ",
31
+ "fewshot_delimiter": "\n\n",
32
+ "num_fewshot": 25,
33
+ "metric_list": [
34
+ {
35
+ "metric": "acc",
36
+ "aggregation": "mean",
37
+ "higher_is_better": true
38
+ },
39
+ {
40
+ "metric": "acc_norm",
41
+ "aggregation": "mean",
42
+ "higher_is_better": true
43
+ }
44
+ ],
45
+ "output_type": "multiple_choice",
46
+ "repeats": 1,
47
+ "should_decontaminate": true,
48
+ "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
49
+ "metadata": {
50
+ "version": 1.0,
51
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
52
+ "tensor_parallel_size": 8,
53
+ "dtype": "float32",
54
+ "gpu_memory_utilization": 0.8
55
+ }
56
+ }
57
+ },
58
+ "versions": {
59
+ "arc_challenge": 1.0
60
+ },
61
+ "n-shot": {
62
+ "arc_challenge": 25
63
+ },
64
+ "higher_is_better": {
65
+ "arc_challenge": {
66
+ "acc": true,
67
+ "acc_norm": true
68
+ }
69
+ },
70
+ "n-samples": {
71
+ "arc_challenge": {
72
+ "original": 1172,
73
+ "effective": 1172
74
+ }
75
+ },
76
+ "config": {
77
+ "model": "vllm",
78
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
79
+ "batch_size": "1",
80
+ "batch_sizes": [],
81
+ "device": null,
82
+ "use_cache": null,
83
+ "limit": null,
84
+ "bootstrap_iters": 100000,
85
+ "gen_kwargs": null,
86
+ "random_seed": 0,
87
+ "numpy_seed": 1234,
88
+ "torch_seed": 1234,
89
+ "fewshot_seed": 1234
90
+ },
91
+ "git_hash": "a445a07",
92
+ "date": 1752779715.342989,
93
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
94
+ "transformers_version": "4.53.0.dev0",
95
+ "lm_eval_version": "0.4.8",
96
+ "upper_git_hash": null,
97
+ "tokenizer_pad_token": [
98
+ "<|end_of_text|>",
99
+ "1"
100
+ ],
101
+ "tokenizer_eos_token": [
102
+ "<|end_of_text|>",
103
+ "1"
104
+ ],
105
+ "tokenizer_bos_token": [
106
+ "<|begin_of_text|>",
107
+ "0"
108
+ ],
109
+ "eot_token_id": 1,
110
+ "max_length": 32768,
111
+ "task_hashes": {
112
+ "arc_challenge": "55e883475b5650b20d8d9dc1e9cdf59ef645a257fcd74bf43a9dbb5c632c529c"
113
+ },
114
+ "model_source": "vllm",
115
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
116
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
117
+ "system_instruction": null,
118
+ "system_instruction_sha": null,
119
+ "fewshot_as_multiturn": false,
120
+ "chat_template": null,
121
+ "chat_template_sha": null,
122
+ "start_time": 1470957.02656259,
123
+ "end_time": 1473500.429686519,
124
+ "total_evaluation_time_seconds": "2543.4031239291653"
125
+ }
iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ba6a3cf4ac05ff39bae31ead5ab97e287408840565822eebb65b974ed78fb874
3
+ size 23433988
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-18-56.328280.json ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_boolean_expressions_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_causal_judgement_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_date_understanding_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_disambiguation_qa_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_dyck_languages_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_formal_fallacies_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_geometric_shapes_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_hyperbaton_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_five_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_seven_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_three_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_movie_recommendation_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_multistep_arithmetic_two_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_navigate_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_object_counting_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_penguins_in_a_table_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_reasoning_about_colored_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_ruin_names_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_salient_translation_error_detection_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_snarks_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_sports_understanding_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_temporal_sequences_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_five_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_seven_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_three_objects_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_web_of_lies_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_word_sorting_2025-07-17T20-18-56.328280.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-44-20.876997.json ADDED
@@ -0,0 +1,161 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gsm8k": {
4
+ "alias": "gsm8k",
5
+ "exact_match,strict-match": 0.6785443517816527,
6
+ "exact_match_stderr,strict-match": 0.012864471384836705,
7
+ "exact_match,flexible-extract": 0.6800606520090978,
8
+ "exact_match_stderr,flexible-extract": 0.012848426555240763
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gsm8k": []
13
+ },
14
+ "configs": {
15
+ "gsm8k": {
16
+ "task": "gsm8k",
17
+ "tag": [
18
+ "math_word_problems"
19
+ ],
20
+ "dataset_path": "gsm8k",
21
+ "dataset_name": "main",
22
+ "training_split": "train",
23
+ "test_split": "test",
24
+ "fewshot_split": "train",
25
+ "doc_to_text": "Question: {{question}}\nAnswer:",
26
+ "doc_to_target": "{{answer}}",
27
+ "unsafe_code": false,
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 5,
32
+ "metric_list": [
33
+ {
34
+ "metric": "exact_match",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true,
37
+ "ignore_case": true,
38
+ "ignore_punctuation": false,
39
+ "regexes_to_ignore": [
40
+ ",",
41
+ "\\$",
42
+ "(?s).*#### ",
43
+ "\\.$"
44
+ ]
45
+ }
46
+ ],
47
+ "output_type": "generate_until",
48
+ "generation_kwargs": {
49
+ "until": [
50
+ "Question:",
51
+ "</s>",
52
+ "<|im_end|>"
53
+ ],
54
+ "do_sample": false,
55
+ "temperature": 0.0
56
+ },
57
+ "repeats": 1,
58
+ "filter_list": [
59
+ {
60
+ "name": "strict-match",
61
+ "filter": [
62
+ {
63
+ "function": "regex",
64
+ "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
65
+ },
66
+ {
67
+ "function": "take_first"
68
+ }
69
+ ]
70
+ },
71
+ {
72
+ "name": "flexible-extract",
73
+ "filter": [
74
+ {
75
+ "function": "regex",
76
+ "group_select": -1,
77
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
78
+ },
79
+ {
80
+ "function": "take_first"
81
+ }
82
+ ]
83
+ }
84
+ ],
85
+ "should_decontaminate": false,
86
+ "metadata": {
87
+ "version": 3.0,
88
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
89
+ "tensor_parallel_size": 8,
90
+ "dtype": "float32",
91
+ "gpu_memory_utilization": 0.8
92
+ }
93
+ }
94
+ },
95
+ "versions": {
96
+ "gsm8k": 3.0
97
+ },
98
+ "n-shot": {
99
+ "gsm8k": 5
100
+ },
101
+ "higher_is_better": {
102
+ "gsm8k": {
103
+ "exact_match": true
104
+ }
105
+ },
106
+ "n-samples": {
107
+ "gsm8k": {
108
+ "original": 1319,
109
+ "effective": 1319
110
+ }
111
+ },
112
+ "config": {
113
+ "model": "vllm",
114
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
115
+ "batch_size": "1",
116
+ "batch_sizes": [],
117
+ "device": null,
118
+ "use_cache": null,
119
+ "limit": null,
120
+ "bootstrap_iters": 100000,
121
+ "gen_kwargs": null,
122
+ "random_seed": 0,
123
+ "numpy_seed": 1234,
124
+ "torch_seed": 1234,
125
+ "fewshot_seed": 1234
126
+ },
127
+ "git_hash": "a445a07",
128
+ "date": 1752782267.9471405,
129
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
130
+ "transformers_version": "4.53.0.dev0",
131
+ "lm_eval_version": "0.4.8",
132
+ "upper_git_hash": null,
133
+ "tokenizer_pad_token": [
134
+ "<|end_of_text|>",
135
+ "1"
136
+ ],
137
+ "tokenizer_eos_token": [
138
+ "<|end_of_text|>",
139
+ "1"
140
+ ],
141
+ "tokenizer_bos_token": [
142
+ "<|begin_of_text|>",
143
+ "0"
144
+ ],
145
+ "eot_token_id": 1,
146
+ "max_length": 32768,
147
+ "task_hashes": {
148
+ "gsm8k": "2330f4ebfcccaf66a892922df2819cdb1f118e448d076d3f42bdde4177678ac7"
149
+ },
150
+ "model_source": "vllm",
151
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
152
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
153
+ "system_instruction": null,
154
+ "system_instruction_sha": null,
155
+ "fewshot_as_multiturn": false,
156
+ "chat_template": null,
157
+ "chat_template_sha": null,
158
+ "start_time": 1473522.659870797,
159
+ "end_time": 1476320.847624145,
160
+ "total_evaluation_time_seconds": "2798.187753347913"
161
+ }
iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_2025-07-17T20-44-20.876997.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:356b69e32a7269557dd4d9232d1880291c698fe6fa684df598cb151b91c1a312
3
+ size 12375232
iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-07T20-33-19.023288.json ADDED
@@ -0,0 +1,195 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gsm8k_cot": {
4
+ "alias": "gsm8k_cot",
5
+ "exact_match,strict-match": 0.5852918877937832,
6
+ "exact_match_stderr,strict-match": 0.01357062384230451,
7
+ "exact_match,flexible-extract": 0.6322971948445792,
8
+ "exact_match_stderr,flexible-extract": 0.01328163050339548
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "gsm8k_cot": []
13
+ },
14
+ "configs": {
15
+ "gsm8k_cot": {
16
+ "task": "gsm8k_cot",
17
+ "tag": [
18
+ "chain_of_thought"
19
+ ],
20
+ "dataset_path": "gsm8k",
21
+ "dataset_name": "main",
22
+ "test_split": "test",
23
+ "doc_to_text": "Q: {{question}}\nA:",
24
+ "doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
25
+ "unsafe_code": false,
26
+ "description": "",
27
+ "target_delimiter": " ",
28
+ "fewshot_delimiter": "\n\n",
29
+ "fewshot_config": {
30
+ "sampler": "first_n",
31
+ "samples": [
32
+ {
33
+ "question": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
34
+ "target": "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6."
35
+ },
36
+ {
37
+ "question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
38
+ "target": "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5."
39
+ },
40
+ {
41
+ "question": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
42
+ "target": "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39."
43
+ },
44
+ {
45
+ "question": "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
46
+ "target": "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8."
47
+ },
48
+ {
49
+ "question": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
50
+ "target": "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9."
51
+ },
52
+ {
53
+ "question": "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
54
+ "target": "There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29."
55
+ },
56
+ {
57
+ "question": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
58
+ "target": "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33."
59
+ },
60
+ {
61
+ "question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
62
+ "target": "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8."
63
+ }
64
+ ]
65
+ },
66
+ "num_fewshot": 8,
67
+ "metric_list": [
68
+ {
69
+ "aggregation": "mean",
70
+ "higher_is_better": true,
71
+ "ignore_case": true,
72
+ "ignore_punctuation": false,
73
+ "metric": "exact_match",
74
+ "regexes_to_ignore": [
75
+ ",",
76
+ "\\$",
77
+ "(?s).*#### ",
78
+ "\\.$"
79
+ ]
80
+ }
81
+ ],
82
+ "output_type": "generate_until",
83
+ "generation_kwargs": {
84
+ "do_sample": false,
85
+ "until": [
86
+ "Q:",
87
+ "</s>",
88
+ "<|im_end|>"
89
+ ]
90
+ },
91
+ "repeats": 1,
92
+ "filter_list": [
93
+ {
94
+ "filter": [
95
+ {
96
+ "function": "regex",
97
+ "regex_pattern": "The answer is (\\-?[0-9\\.\\,]+)."
98
+ },
99
+ {
100
+ "function": "take_first"
101
+ }
102
+ ],
103
+ "name": "strict-match"
104
+ },
105
+ {
106
+ "filter": [
107
+ {
108
+ "function": "regex",
109
+ "group_select": -1,
110
+ "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
111
+ },
112
+ {
113
+ "function": "take_first"
114
+ }
115
+ ],
116
+ "name": "flexible-extract"
117
+ }
118
+ ],
119
+ "should_decontaminate": false,
120
+ "metadata": {
121
+ "version": 3.0,
122
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
123
+ "tensor_parallel_size": 8,
124
+ "dtype": "float32",
125
+ "gpu_memory_utilization": 0.7
126
+ }
127
+ }
128
+ },
129
+ "versions": {
130
+ "gsm8k_cot": 3.0
131
+ },
132
+ "n-shot": {
133
+ "gsm8k_cot": 8
134
+ },
135
+ "higher_is_better": {
136
+ "gsm8k_cot": {
137
+ "exact_match": true
138
+ }
139
+ },
140
+ "n-samples": {
141
+ "gsm8k_cot": {
142
+ "original": 1319,
143
+ "effective": 1319
144
+ }
145
+ },
146
+ "config": {
147
+ "model": "vllm",
148
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.7",
149
+ "batch_size": "1",
150
+ "batch_sizes": [],
151
+ "device": null,
152
+ "use_cache": null,
153
+ "limit": null,
154
+ "bootstrap_iters": 100000,
155
+ "gen_kwargs": null,
156
+ "random_seed": 0,
157
+ "numpy_seed": 1234,
158
+ "torch_seed": 1234,
159
+ "fewshot_seed": 1234
160
+ },
161
+ "git_hash": "18965e2",
162
+ "date": 1754596227.0581243,
163
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
164
+ "transformers_version": "4.53.0.dev0",
165
+ "lm_eval_version": "0.4.8",
166
+ "upper_git_hash": null,
167
+ "tokenizer_pad_token": [
168
+ "<|end_of_text|>",
169
+ "1"
170
+ ],
171
+ "tokenizer_eos_token": [
172
+ "<|end_of_text|>",
173
+ "1"
174
+ ],
175
+ "tokenizer_bos_token": [
176
+ "<|begin_of_text|>",
177
+ "0"
178
+ ],
179
+ "eot_token_id": 1,
180
+ "max_length": 32768,
181
+ "task_hashes": {
182
+ "gsm8k_cot": "fc360963b39ee52c26a82795124f9ad7da4d6a8fecf1b77e2502823b1669b3d0"
183
+ },
184
+ "model_source": "vllm",
185
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
186
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
187
+ "system_instruction": null,
188
+ "system_instruction_sha": null,
189
+ "fewshot_as_multiturn": false,
190
+ "chat_template": null,
191
+ "chat_template_sha": null,
192
+ "start_time": 3287464.513322936,
193
+ "end_time": 3290055.302254567,
194
+ "total_evaluation_time_seconds": "2590.7889316310175"
195
+ }
iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_cot_2025-08-07T20-33-19.023288.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-26T00-59-11.177986.json ADDED
@@ -0,0 +1,130 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "gsm8k_reasoning_base": {
4
+ "alias": "gsm8k_reasoning_base",
5
+ "math_verify,none": 0.07808946171341925,
6
+ "math_verify_stderr,none": 0.007390654481108267
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "gsm8k_reasoning_base": []
11
+ },
12
+ "configs": {
13
+ "gsm8k_reasoning_base": {
14
+ "task": "gsm8k_reasoning_base",
15
+ "tag": [
16
+ "math_word_problems"
17
+ ],
18
+ "dataset_path": "gsm8k",
19
+ "dataset_name": "main",
20
+ "training_split": "train",
21
+ "test_split": "test",
22
+ "fewshot_split": "train",
23
+ "doc_to_text": "The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the final answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. In the answer mention each unknown and its solution, for example, <answer> boxed{10} </answer>. Now the user asks you to solve a math reasoning problem.\n\nUser:{{question}}\nAssistant: <think>",
24
+ "doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
25
+ "unsafe_code": false,
26
+ "process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n # Extract from \\\\boxed{} if present\n boxed_answer = last_boxed_only_string(candidates)\n if boxed_answer is not None:\n try:\n boxed_content = remove_boxed(boxed_answer)\n if boxed_content is not None:\n candidates = boxed_content\n except (AssertionError, IndexError):\n pass\n\n # math_verify\n # print(\"=\"*100)\n # print(candidates, doc[\"answer\"].split('####')[-1].strip())\n # print(\"=\"*100)\n\n # Extract only digits from candidates\n raw_candidates = candidates\n if \"</answer>\" in candidates:\n candidates = candidates.split(\"</answer>\")[-2]\n if \"<answer>\" in candidates:\n candidates = candidates.split(\"<answer>\")[-1]\n candidates = candidates.split('=')[-1]\n if \"**\"in candidates:\n candidates = candidates.split(\"**\")[-2]\n if '$' in candidates:\n candidates = candidates.split('$')[-1]\n candidates = ''.join(c for c in candidates if c.isdigit() or c in \".-\")\n\n res = verify(parse(doc[\"answer\"].split('####')[-1].strip()), parse(candidates))\n mathval = 1 if res else 0\n if mathval == 0:\n gt = parse(doc[\"answer\"].split(\"####\")[-1].strip())\n print('=' * 80)\n print(f\"{parse(candidates)=}\")\n print(f\"{gt=}\")\n print(f\"{candidates=}\")\n print(f\"{repr(raw_candidates)=}\")\n\n results = {\n \"math_verify\": mathval,\n }\n return results\n",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "math_verify",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ }
37
+ ],
38
+ "output_type": "generate_until",
39
+ "generation_kwargs": {
40
+ "until": [
41
+ "Question:",
42
+ "</s>",
43
+ "<|im_end|>"
44
+ ],
45
+ "do_sample": true,
46
+ "temperature": 1.0,
47
+ "max_gen_toks": 32768
48
+ },
49
+ "repeats": 1,
50
+ "should_decontaminate": false,
51
+ "metadata": {
52
+ "version": 3.0,
53
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
54
+ "tensor_parallel_size": 8,
55
+ "dtype": "float32",
56
+ "gpu_memory_utilization": 0.9
57
+ }
58
+ }
59
+ },
60
+ "versions": {
61
+ "gsm8k_reasoning_base": 3.0
62
+ },
63
+ "n-shot": {
64
+ "gsm8k_reasoning_base": 0
65
+ },
66
+ "higher_is_better": {
67
+ "gsm8k_reasoning_base": {
68
+ "math_verify": true
69
+ }
70
+ },
71
+ "n-samples": {
72
+ "gsm8k_reasoning_base": {
73
+ "original": 1319,
74
+ "effective": 1319
75
+ }
76
+ },
77
+ "config": {
78
+ "model": "vllm",
79
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.9",
80
+ "batch_size": "auto",
81
+ "batch_sizes": [],
82
+ "device": null,
83
+ "use_cache": null,
84
+ "limit": null,
85
+ "bootstrap_iters": 100000,
86
+ "gen_kwargs": {
87
+ "do_sample": true,
88
+ "temperature": 1.0,
89
+ "max_gen_toks": 32768
90
+ },
91
+ "random_seed": 0,
92
+ "numpy_seed": 1234,
93
+ "torch_seed": 1234,
94
+ "fewshot_seed": 1234
95
+ },
96
+ "git_hash": "e9f1740",
97
+ "date": 1764113938.7952223,
98
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
99
+ "transformers_version": "4.53.3",
100
+ "lm_eval_version": "0.4.9.1",
101
+ "upper_git_hash": null,
102
+ "tokenizer_pad_token": [
103
+ "<|end_of_text|>",
104
+ "1"
105
+ ],
106
+ "tokenizer_eos_token": [
107
+ "<|end_of_text|>",
108
+ "1"
109
+ ],
110
+ "tokenizer_bos_token": [
111
+ "<|begin_of_text|>",
112
+ "0"
113
+ ],
114
+ "eot_token_id": 1,
115
+ "max_length": 32768,
116
+ "task_hashes": {
117
+ "gsm8k_reasoning_base": "727a4d754e27098dfdff5cd9b4a7bb1cfe61b92efda150fe0051c36b1c0a04ad"
118
+ },
119
+ "model_source": "vllm",
120
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
121
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__",
122
+ "system_instruction": null,
123
+ "system_instruction_sha": null,
124
+ "fewshot_as_multiturn": false,
125
+ "chat_template": null,
126
+ "chat_template_sha": null,
127
+ "start_time": 60649.515730168,
128
+ "end_time": 65471.732839258,
129
+ "total_evaluation_time_seconds": "4822.217109090001"
130
+ }
iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_gsm8k_reasoning_base_2025-11-26T00-59-11.177986.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:01727c3fade7f869b90320445fe1f222a4a8408b274ee31b4aee8f3c23d8b3a5
3
+ size 37855817
iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T23-58-29.900269.json ADDED
@@ -0,0 +1,126 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "hellaswag": {
4
+ "alias": "hellaswag",
5
+ "acc,none": 0.6917944632543318,
6
+ "acc_stderr,none": 0.0046080828155357035,
7
+ "acc_norm,none": 0.8778131846245768,
8
+ "acc_norm_stderr,none": 0.003268321260913458
9
+ }
10
+ },
11
+ "group_subtasks": {
12
+ "hellaswag": []
13
+ },
14
+ "configs": {
15
+ "hellaswag": {
16
+ "task": "hellaswag",
17
+ "tag": [
18
+ "multiple_choice"
19
+ ],
20
+ "dataset_path": "hellaswag",
21
+ "dataset_kwargs": {
22
+ "trust_remote_code": true
23
+ },
24
+ "training_split": "train",
25
+ "validation_split": "validation",
26
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
27
+ "doc_to_text": "{{query}}",
28
+ "doc_to_target": "{{label}}",
29
+ "unsafe_code": false,
30
+ "doc_to_choice": "choices",
31
+ "description": "",
32
+ "target_delimiter": " ",
33
+ "fewshot_delimiter": "\n\n",
34
+ "num_fewshot": 10,
35
+ "metric_list": [
36
+ {
37
+ "metric": "acc",
38
+ "aggregation": "mean",
39
+ "higher_is_better": true
40
+ },
41
+ {
42
+ "metric": "acc_norm",
43
+ "aggregation": "mean",
44
+ "higher_is_better": true
45
+ }
46
+ ],
47
+ "output_type": "multiple_choice",
48
+ "repeats": 1,
49
+ "should_decontaminate": false,
50
+ "metadata": {
51
+ "version": 1.0,
52
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
53
+ "tensor_parallel_size": 8,
54
+ "dtype": "float32",
55
+ "gpu_memory_utilization": 0.8
56
+ }
57
+ }
58
+ },
59
+ "versions": {
60
+ "hellaswag": 1.0
61
+ },
62
+ "n-shot": {
63
+ "hellaswag": 10
64
+ },
65
+ "higher_is_better": {
66
+ "hellaswag": {
67
+ "acc": true,
68
+ "acc_norm": true
69
+ }
70
+ },
71
+ "n-samples": {
72
+ "hellaswag": {
73
+ "original": 10042,
74
+ "effective": 10042
75
+ }
76
+ },
77
+ "config": {
78
+ "model": "vllm",
79
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
80
+ "batch_size": "1",
81
+ "batch_sizes": [],
82
+ "device": null,
83
+ "use_cache": null,
84
+ "limit": null,
85
+ "bootstrap_iters": 100000,
86
+ "gen_kwargs": null,
87
+ "random_seed": 0,
88
+ "numpy_seed": 1234,
89
+ "torch_seed": 1234,
90
+ "fewshot_seed": 1234
91
+ },
92
+ "git_hash": "a445a07",
93
+ "date": 1752779716.3588188,
94
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
95
+ "transformers_version": "4.53.0.dev0",
96
+ "lm_eval_version": "0.4.8",
97
+ "upper_git_hash": null,
98
+ "tokenizer_pad_token": [
99
+ "<|end_of_text|>",
100
+ "1"
101
+ ],
102
+ "tokenizer_eos_token": [
103
+ "<|end_of_text|>",
104
+ "1"
105
+ ],
106
+ "tokenizer_bos_token": [
107
+ "<|begin_of_text|>",
108
+ "0"
109
+ ],
110
+ "eot_token_id": 1,
111
+ "max_length": 32768,
112
+ "task_hashes": {
113
+ "hellaswag": "d4bcb44ec68db2b8a65f050c3c64c48454179b48fd8aee3e73b55e2ec51e6d82"
114
+ },
115
+ "model_source": "vllm",
116
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
117
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
118
+ "system_instruction": null,
119
+ "system_instruction_sha": null,
120
+ "fewshot_as_multiturn": false,
121
+ "chat_template": null,
122
+ "chat_template_sha": null,
123
+ "start_time": 1470954.839314792,
124
+ "end_time": 1487967.681337241,
125
+ "total_evaluation_time_seconds": "17012.84202244901"
126
+ }
iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_hellaswag_2025-07-17T23-58-29.900269.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c9d41f492759ccbaa512715014c7f9fedbf37f04ce4d579ab84191b90ea7a812
3
+ size 187317968
iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-26-52.816826.json ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "humaneval": {
4
+ "alias": "humaneval",
5
+ "pass@1,create_test": 0.5,
6
+ "pass@1_stderr,create_test": 0.03916302249939787
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "humaneval": []
11
+ },
12
+ "configs": {
13
+ "humaneval": {
14
+ "task": "humaneval",
15
+ "dataset_path": "openai/openai_humaneval",
16
+ "test_split": "test",
17
+ "doc_to_text": "{{prompt}}",
18
+ "doc_to_target": "{{test}}\ncheck({{entry_point}})",
19
+ "unsafe_code": true,
20
+ "description": "",
21
+ "target_delimiter": " ",
22
+ "fewshot_delimiter": "\n\n",
23
+ "num_fewshot": 0,
24
+ "metric_list": [
25
+ {
26
+ "metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
27
+ "aggregation": "mean",
28
+ "higher_is_better": true,
29
+ "k": [
30
+ 1
31
+ ]
32
+ }
33
+ ],
34
+ "output_type": "generate_until",
35
+ "generation_kwargs": {
36
+ "until": [
37
+ "\nclass",
38
+ "\ndef",
39
+ "\n#",
40
+ "\nif",
41
+ "\nprint"
42
+ ],
43
+ "max_gen_toks": 1024,
44
+ "do_sample": false
45
+ },
46
+ "repeats": 1,
47
+ "filter_list": [
48
+ {
49
+ "name": "create_test",
50
+ "filter": [
51
+ {
52
+ "function": "custom",
53
+ "filter_fn": "<function build_predictions at 0x150e2169b6a0>"
54
+ }
55
+ ]
56
+ }
57
+ ],
58
+ "should_decontaminate": false,
59
+ "metadata": {
60
+ "version": 1.0,
61
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
62
+ "tensor_parallel_size": 8,
63
+ "dtype": "float32",
64
+ "gpu_memory_utilization": 0.8
65
+ }
66
+ }
67
+ },
68
+ "versions": {
69
+ "humaneval": 1.0
70
+ },
71
+ "n-shot": {
72
+ "humaneval": 0
73
+ },
74
+ "higher_is_better": {
75
+ "humaneval": {
76
+ "pass_at_k": true
77
+ }
78
+ },
79
+ "n-samples": {
80
+ "humaneval": {
81
+ "original": 164,
82
+ "effective": 164
83
+ }
84
+ },
85
+ "config": {
86
+ "model": "vllm",
87
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
88
+ "batch_size": "1",
89
+ "batch_sizes": [],
90
+ "device": null,
91
+ "use_cache": null,
92
+ "limit": null,
93
+ "bootstrap_iters": 100000,
94
+ "gen_kwargs": null,
95
+ "random_seed": 0,
96
+ "numpy_seed": 1234,
97
+ "torch_seed": 1234,
98
+ "fewshot_seed": 1234
99
+ },
100
+ "git_hash": "a445a07",
101
+ "date": 1752779715.3442433,
102
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
103
+ "transformers_version": "4.53.0.dev0",
104
+ "lm_eval_version": "0.4.8",
105
+ "upper_git_hash": null,
106
+ "tokenizer_pad_token": [
107
+ "<|end_of_text|>",
108
+ "1"
109
+ ],
110
+ "tokenizer_eos_token": [
111
+ "<|end_of_text|>",
112
+ "1"
113
+ ],
114
+ "tokenizer_bos_token": [
115
+ "<|begin_of_text|>",
116
+ "0"
117
+ ],
118
+ "eot_token_id": 1,
119
+ "max_length": 32768,
120
+ "task_hashes": {
121
+ "humaneval": "c122632f0bdffdd0162cc8033a879dfe9d3b35e82c52eda6a767069e0a2beb40"
122
+ },
123
+ "model_source": "vllm",
124
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
125
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
126
+ "system_instruction": null,
127
+ "system_instruction_sha": null,
128
+ "fewshot_as_multiturn": false,
129
+ "chat_template": null,
130
+ "chat_template_sha": null,
131
+ "start_time": 1470959.94938541,
132
+ "end_time": 1471675.70521845,
133
+ "total_evaluation_time_seconds": "715.7558330399916"
134
+ }
iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_2025-07-17T19-26-52.816826.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-22T03-51-05.521361.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "humaneval_64": {
4
+ "alias": "humaneval_64",
5
+ "pass@2,create_test": 0.5489377661633754,
6
+ "pass@2_stderr,create_test": 0.0340805230997571,
7
+ "pass@8,create_test": 0.6579873034944809,
8
+ "pass@8_stderr,create_test": 0.0337464509487294,
9
+ "pass@16,create_test": 0.7013816146155691,
10
+ "pass@16_stderr,create_test": 0.03303964263380142,
11
+ "pass@32,create_test": 0.7398964514096209,
12
+ "pass@32_stderr,create_test": 0.03268103770634371,
13
+ "pass@64,create_test": 0.7682926829268293,
14
+ "pass@64_stderr,create_test": 0.033047561588107836
15
+ }
16
+ },
17
+ "group_subtasks": {
18
+ "humaneval_64": []
19
+ },
20
+ "configs": {
21
+ "humaneval_64": {
22
+ "task": "humaneval_64",
23
+ "dataset_path": "openai/openai_humaneval",
24
+ "test_split": "test",
25
+ "doc_to_text": "{{prompt}}",
26
+ "doc_to_target": "{{test}}\ncheck({{entry_point}})",
27
+ "unsafe_code": true,
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true,
37
+ "k": [
38
+ 2,
39
+ 8,
40
+ 16,
41
+ 32,
42
+ 64
43
+ ]
44
+ }
45
+ ],
46
+ "output_type": "generate_until",
47
+ "generation_kwargs": {
48
+ "until": [
49
+ "\nclass",
50
+ "\ndef",
51
+ "\n#",
52
+ "\nif",
53
+ "\nprint"
54
+ ],
55
+ "max_gen_toks": 1024,
56
+ "do_sample": true,
57
+ "temperature": 0.2,
58
+ "top_p": 0.95
59
+ },
60
+ "repeats": 64,
61
+ "filter_list": [
62
+ {
63
+ "name": "create_test",
64
+ "filter": [
65
+ {
66
+ "function": "custom",
67
+ "filter_fn": "<function build_predictions at 0x14f666525080>"
68
+ }
69
+ ]
70
+ }
71
+ ],
72
+ "should_decontaminate": false,
73
+ "metadata": {
74
+ "version": 1.0,
75
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
76
+ "tensor_parallel_size": 8,
77
+ "dtype": "float32",
78
+ "gpu_memory_utilization": 0.9
79
+ }
80
+ }
81
+ },
82
+ "versions": {
83
+ "humaneval_64": 1.0
84
+ },
85
+ "n-shot": {
86
+ "humaneval_64": 0
87
+ },
88
+ "higher_is_better": {
89
+ "humaneval_64": {
90
+ "pass_at_k": true
91
+ }
92
+ },
93
+ "n-samples": {
94
+ "humaneval_64": {
95
+ "original": 164,
96
+ "effective": 164
97
+ }
98
+ },
99
+ "config": {
100
+ "model": "vllm",
101
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.9",
102
+ "batch_size": "auto",
103
+ "batch_sizes": [],
104
+ "device": null,
105
+ "use_cache": null,
106
+ "limit": null,
107
+ "bootstrap_iters": 100000,
108
+ "gen_kwargs": null,
109
+ "random_seed": 0,
110
+ "numpy_seed": 1234,
111
+ "torch_seed": 1234,
112
+ "fewshot_seed": 1234
113
+ },
114
+ "git_hash": "bc23ea7",
115
+ "date": 1755832330.597868,
116
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] nvidia-cublas-cu12 12.1.3.1 pypi_0 
pypi\n[conda] nvidia-cuda-cupti-cu12 12.1.105 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.1.105 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.1.105 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.1.0.70 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.0.2.54 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.2.106 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.4.5.107 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.1.0.106 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.20.5 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.9.86 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.1.105 pypi_0 pypi\n[conda] torch 2.4.1 pypi_0 pypi\n[conda] triton 3.0.0 pypi_0 pypi",
117
+ "transformers_version": "4.53.0.dev0",
118
+ "lm_eval_version": "0.4.9.1",
119
+ "upper_git_hash": null,
120
+ "tokenizer_pad_token": [
121
+ "<|end_of_text|>",
122
+ "1"
123
+ ],
124
+ "tokenizer_eos_token": [
125
+ "<|end_of_text|>",
126
+ "1"
127
+ ],
128
+ "tokenizer_bos_token": [
129
+ "<|begin_of_text|>",
130
+ "0"
131
+ ],
132
+ "eot_token_id": 1,
133
+ "max_length": 32768,
134
+ "task_hashes": {
135
+ "humaneval_64": "c122632f0bdffdd0162cc8033a879dfe9d3b35e82c52eda6a767069e0a2beb40"
136
+ },
137
+ "model_source": "vllm",
138
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
139
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
140
+ "system_instruction": null,
141
+ "system_instruction_sha": null,
142
+ "fewshot_as_multiturn": false,
143
+ "chat_template": null,
144
+ "chat_template_sha": null,
145
+ "start_time": 69255.504785092,
146
+ "end_time": 71603.678026169,
147
+ "total_evaluation_time_seconds": "2348.1732410770055"
148
+ }
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_64_2025-08-22T03-51-05.521361.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e0648dffef479001422c9be1420555df168a9063f0c2910915904f2ada4e8cdd
3
+ size 13131722
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-25T01-35-23.860829.json ADDED
@@ -0,0 +1,148 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "humaneval_64": {
4
+ "alias": "humaneval_64",
5
+ "pass@2,create_test": 0.5490285036778934,
6
+ "pass@2_stderr,create_test": 0.0340842438280205,
7
+ "pass@8,create_test": 0.6579901080473166,
8
+ "pass@8_stderr,create_test": 0.03374662453348631,
9
+ "pass@16,create_test": 0.7013816165516722,
10
+ "pass@16_stderr,create_test": 0.03303964274115624,
11
+ "pass@32,create_test": 0.7398964514096209,
12
+ "pass@32_stderr,create_test": 0.03268103770634371,
13
+ "pass@64,create_test": 0.7682926829268293,
14
+ "pass@64_stderr,create_test": 0.033047561588107836
15
+ }
16
+ },
17
+ "group_subtasks": {
18
+ "humaneval_64": []
19
+ },
20
+ "configs": {
21
+ "humaneval_64": {
22
+ "task": "humaneval_64",
23
+ "dataset_path": "openai/openai_humaneval",
24
+ "test_split": "test",
25
+ "doc_to_text": "{{prompt}}",
26
+ "doc_to_target": "{{test}}\ncheck({{entry_point}})",
27
+ "unsafe_code": true,
28
+ "description": "",
29
+ "target_delimiter": " ",
30
+ "fewshot_delimiter": "\n\n",
31
+ "num_fewshot": 0,
32
+ "metric_list": [
33
+ {
34
+ "metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
35
+ "aggregation": "mean",
36
+ "higher_is_better": true,
37
+ "k": [
38
+ 2,
39
+ 8,
40
+ 16,
41
+ 32,
42
+ 64
43
+ ]
44
+ }
45
+ ],
46
+ "output_type": "generate_until",
47
+ "generation_kwargs": {
48
+ "until": [
49
+ "\nclass",
50
+ "\ndef",
51
+ "\n#",
52
+ "\nif",
53
+ "\nprint"
54
+ ],
55
+ "max_gen_toks": 1024,
56
+ "do_sample": true,
57
+ "temperature": 0.2,
58
+ "top_p": 0.95
59
+ },
60
+ "repeats": 64,
61
+ "filter_list": [
62
+ {
63
+ "name": "create_test",
64
+ "filter": [
65
+ {
66
+ "function": "custom",
67
+ "filter_fn": "<function build_predictions at 0x1518b1ea5d00>"
68
+ }
69
+ ]
70
+ }
71
+ ],
72
+ "should_decontaminate": false,
73
+ "metadata": {
74
+ "version": 1.0,
75
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
76
+ "tensor_parallel_size": 8,
77
+ "dtype": "float32",
78
+ "gpu_memory_utilization": 0.9
79
+ }
80
+ }
81
+ },
82
+ "versions": {
83
+ "humaneval_64": 1.0
84
+ },
85
+ "n-shot": {
86
+ "humaneval_64": 0
87
+ },
88
+ "higher_is_better": {
89
+ "humaneval_64": {
90
+ "pass_at_k": true
91
+ }
92
+ },
93
+ "n-samples": {
94
+ "humaneval_64": {
95
+ "original": 164,
96
+ "effective": 164
97
+ }
98
+ },
99
+ "config": {
100
+ "model": "vllm",
101
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.9",
102
+ "batch_size": "auto",
103
+ "batch_sizes": [],
104
+ "device": null,
105
+ "use_cache": null,
106
+ "limit": null,
107
+ "bootstrap_iters": 100000,
108
+ "gen_kwargs": null,
109
+ "random_seed": 0,
110
+ "numpy_seed": 1234,
111
+ "torch_seed": 1234,
112
+ "fewshot_seed": 1234
113
+ },
114
+ "git_hash": "e9f1740",
115
+ "date": 1764032186.8127863,
116
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
117
+ "transformers_version": "4.53.3",
118
+ "lm_eval_version": "0.4.9.1",
119
+ "upper_git_hash": null,
120
+ "tokenizer_pad_token": [
121
+ "<|end_of_text|>",
122
+ "1"
123
+ ],
124
+ "tokenizer_eos_token": [
125
+ "<|end_of_text|>",
126
+ "1"
127
+ ],
128
+ "tokenizer_bos_token": [
129
+ "<|begin_of_text|>",
130
+ "0"
131
+ ],
132
+ "eot_token_id": 1,
133
+ "max_length": 32768,
134
+ "task_hashes": {
135
+ "humaneval_64": "c122632f0bdffdd0162cc8033a879dfe9d3b35e82c52eda6a767069e0a2beb40"
136
+ },
137
+ "model_source": "vllm",
138
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
139
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__",
140
+ "system_instruction": null,
141
+ "system_instruction_sha": null,
142
+ "fewshot_as_multiturn": false,
143
+ "chat_template": null,
144
+ "chat_template_sha": null,
145
+ "start_time": 2356619.571626194,
146
+ "end_time": 2358988.00763084,
147
+ "total_evaluation_time_seconds": "2368.4360046461225"
148
+ }
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_humaneval_64_2025-11-25T01-35-23.860829.jsonl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:33c4be2197c48883fb25516739ea127716b0d9ba8d2c370c17d3e6a94b27edfc
3
+ size 13131722
iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-13-52.015185.json ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "ifeval": {
4
+ "alias": "ifeval",
5
+ "prompt_level_strict_acc,none": 0.1256931608133087,
6
+ "prompt_level_strict_acc_stderr,none": 0.014265627567173898,
7
+ "inst_level_strict_acc,none": 0.22302158273381295,
8
+ "inst_level_strict_acc_stderr,none": "N/A",
9
+ "prompt_level_loose_acc,none": 0.133086876155268,
10
+ "prompt_level_loose_acc_stderr,none": 0.014617009342904457,
11
+ "inst_level_loose_acc,none": 0.23381294964028776,
12
+ "inst_level_loose_acc_stderr,none": "N/A"
13
+ }
14
+ },
15
+ "group_subtasks": {
16
+ "ifeval": []
17
+ },
18
+ "configs": {
19
+ "ifeval": {
20
+ "task": "ifeval",
21
+ "dataset_path": "google/IFEval",
22
+ "test_split": "train",
23
+ "doc_to_text": "prompt",
24
+ "doc_to_target": 0,
25
+ "unsafe_code": false,
26
+ "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
27
+ "description": "",
28
+ "target_delimiter": " ",
29
+ "fewshot_delimiter": "\n\n",
30
+ "num_fewshot": 0,
31
+ "metric_list": [
32
+ {
33
+ "metric": "prompt_level_strict_acc",
34
+ "aggregation": "mean",
35
+ "higher_is_better": true
36
+ },
37
+ {
38
+ "metric": "inst_level_strict_acc",
39
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
40
+ "higher_is_better": true
41
+ },
42
+ {
43
+ "metric": "prompt_level_loose_acc",
44
+ "aggregation": "mean",
45
+ "higher_is_better": true
46
+ },
47
+ {
48
+ "metric": "inst_level_loose_acc",
49
+ "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
50
+ "higher_is_better": true
51
+ }
52
+ ],
53
+ "output_type": "generate_until",
54
+ "generation_kwargs": {
55
+ "until": [],
56
+ "do_sample": false,
57
+ "temperature": 0.0,
58
+ "max_gen_toks": 1280
59
+ },
60
+ "repeats": 1,
61
+ "should_decontaminate": false,
62
+ "metadata": {
63
+ "version": 4.0,
64
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
65
+ "tensor_parallel_size": 8,
66
+ "dtype": "float32",
67
+ "gpu_memory_utilization": 0.8
68
+ }
69
+ }
70
+ },
71
+ "versions": {
72
+ "ifeval": 4.0
73
+ },
74
+ "n-shot": {
75
+ "ifeval": 0
76
+ },
77
+ "higher_is_better": {
78
+ "ifeval": {
79
+ "prompt_level_strict_acc": true,
80
+ "inst_level_strict_acc": true,
81
+ "prompt_level_loose_acc": true,
82
+ "inst_level_loose_acc": true
83
+ }
84
+ },
85
+ "n-samples": {
86
+ "ifeval": {
87
+ "original": 541,
88
+ "effective": 541
89
+ }
90
+ },
91
+ "config": {
92
+ "model": "vllm",
93
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
94
+ "batch_size": "auto",
95
+ "batch_sizes": [],
96
+ "device": null,
97
+ "use_cache": null,
98
+ "limit": null,
99
+ "bootstrap_iters": 100000,
100
+ "gen_kwargs": null,
101
+ "random_seed": 0,
102
+ "numpy_seed": 1234,
103
+ "torch_seed": 1234,
104
+ "fewshot_seed": 1234
105
+ },
106
+ "git_hash": "a445a07",
107
+ "date": 1752782646.8382735,
108
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
109
+ "transformers_version": "4.53.0.dev0",
110
+ "lm_eval_version": "0.4.8",
111
+ "upper_git_hash": null,
112
+ "tokenizer_pad_token": [
113
+ "<|end_of_text|>",
114
+ "1"
115
+ ],
116
+ "tokenizer_eos_token": [
117
+ "<|end_of_text|>",
118
+ "1"
119
+ ],
120
+ "tokenizer_bos_token": [
121
+ "<|begin_of_text|>",
122
+ "0"
123
+ ],
124
+ "eot_token_id": 1,
125
+ "max_length": 32768,
126
+ "task_hashes": {
127
+ "ifeval": "a9cc24d7d92904c9f59225bb28b88b892d9ab82be222808ea7fa345ffd4500ae"
128
+ },
129
+ "model_source": "vllm",
130
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
131
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
132
+ "system_instruction": null,
133
+ "system_instruction_sha": null,
134
+ "fewshot_as_multiturn": false,
135
+ "chat_template": null,
136
+ "chat_template_sha": null,
137
+ "start_time": 1473896.634510482,
138
+ "end_time": 1474487.194394837,
139
+ "total_evaluation_time_seconds": "590.5598843549378"
140
+ }
iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_ifeval_2025-07-17T20-13-52.015185.jsonl ADDED
The diff for this file is too large to render. See raw diff
 
iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-26-10.269030.json ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "results": {
3
+ "leaderboard_gpqa_diamond": {
4
+ "alias": "leaderboard_gpqa_diamond",
5
+ "acc_norm,none": 0.26262626262626265,
6
+ "acc_norm_stderr,none": 0.031353050095330834
7
+ }
8
+ },
9
+ "group_subtasks": {
10
+ "leaderboard_gpqa_diamond": []
11
+ },
12
+ "configs": {
13
+ "leaderboard_gpqa_diamond": {
14
+ "task": "leaderboard_gpqa_diamond",
15
+ "dataset_path": "Idavidrein/gpqa",
16
+ "dataset_name": "gpqa_diamond",
17
+ "training_split": "train",
18
+ "validation_split": "train",
19
+ "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
20
+ "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer: ",
21
+ "doc_to_target": "answer",
22
+ "unsafe_code": false,
23
+ "doc_to_choice": [
24
+ "(A)",
25
+ "(B)",
26
+ "(C)",
27
+ "(D)"
28
+ ],
29
+ "description": "",
30
+ "target_delimiter": " ",
31
+ "fewshot_delimiter": "\n\n",
32
+ "fewshot_config": {
33
+ "sampler": "first_n"
34
+ },
35
+ "num_fewshot": 0,
36
+ "metric_list": [
37
+ {
38
+ "metric": "acc_norm",
39
+ "aggregation": "mean",
40
+ "higher_is_better": true
41
+ }
42
+ ],
43
+ "output_type": "multiple_choice",
44
+ "repeats": 1,
45
+ "should_decontaminate": false,
46
+ "metadata": {
47
+ "version": 1.0,
48
+ "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
49
+ "tensor_parallel_size": 8,
50
+ "dtype": "float32",
51
+ "gpu_memory_utilization": 0.7
52
+ }
53
+ }
54
+ },
55
+ "versions": {
56
+ "leaderboard_gpqa_diamond": 1.0
57
+ },
58
+ "n-shot": {
59
+ "leaderboard_gpqa_diamond": 0
60
+ },
61
+ "higher_is_better": {
62
+ "leaderboard_gpqa_diamond": {
63
+ "acc_norm": true
64
+ }
65
+ },
66
+ "n-samples": {
67
+ "leaderboard_gpqa_diamond": {
68
+ "original": 198,
69
+ "effective": 198
70
+ }
71
+ },
72
+ "config": {
73
+ "model": "vllm",
74
+ "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.7",
75
+ "batch_size": "1",
76
+ "batch_sizes": [],
77
+ "device": null,
78
+ "use_cache": null,
79
+ "limit": null,
80
+ "bootstrap_iters": 100000,
81
+ "gen_kwargs": null,
82
+ "random_seed": 0,
83
+ "numpy_seed": 1234,
84
+ "torch_seed": 1234,
85
+ "fewshot_seed": 1234
86
+ },
87
+ "git_hash": "a445a07",
88
+ "date": 1752783576.6566072,
89
+ "pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg 
avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] 
flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
90
+ "transformers_version": "4.53.0.dev0",
91
+ "lm_eval_version": "0.4.8",
92
+ "upper_git_hash": null,
93
+ "tokenizer_pad_token": [
94
+ "<|end_of_text|>",
95
+ "1"
96
+ ],
97
+ "tokenizer_eos_token": [
98
+ "<|end_of_text|>",
99
+ "1"
100
+ ],
101
+ "tokenizer_bos_token": [
102
+ "<|begin_of_text|>",
103
+ "0"
104
+ ],
105
+ "eot_token_id": 1,
106
+ "max_length": 32768,
107
+ "task_hashes": {
108
+ "leaderboard_gpqa_diamond": "45f449f3b3dfc0be532cf1913f1559cf9d7645e56ec3c3fe01317fc575a54e3d"
109
+ },
110
+ "model_source": "vllm",
111
+ "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
112
+ "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
113
+ "system_instruction": null,
114
+ "system_instruction_sha": null,
115
+ "fewshot_as_multiturn": false,
116
+ "chat_template": null,
117
+ "chat_template_sha": null,
118
+ "start_time": 1474830.813494996,
119
+ "end_time": 1475231.21819884,
120
+ "total_evaluation_time_seconds": "400.4047038441058"
121
+ }
iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_leaderboard_gpqa_diamond_2025-07-17T20-26-10.269030.jsonl ADDED
The diff for this file is too large to render. See raw diff