Upload base_final
This view is limited to 50 files because it contains too many changes. See raw diff
- .gitattributes +10 -0
- iter_1249000/config.json +30 -0
- iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-57-20.454173.json +125 -0
- iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl +3 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-18-56.328280.json +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_boolean_expressions_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_causal_judgement_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_date_understanding_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_disambiguation_qa_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_dyck_languages_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_formal_fallacies_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_geometric_shapes_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_hyperbaton_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_five_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_seven_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_three_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_movie_recommendation_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_multistep_arithmetic_two_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_navigate_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_object_counting_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_penguins_in_a_table_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_reasoning_about_colored_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_ruin_names_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_salient_translation_error_detection_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_snarks_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_sports_understanding_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_temporal_sequences_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_five_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_seven_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_three_objects_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_web_of_lies_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_word_sorting_2025-07-17T20-18-56.328280.jsonl +0 -0
- iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-44-20.876997.json +161 -0
- iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_2025-07-17T20-44-20.876997.jsonl +3 -0
- iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-07T20-33-19.023288.json +195 -0
- iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_cot_2025-08-07T20-33-19.023288.jsonl +0 -0
- iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-26T00-59-11.177986.json +130 -0
- iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_gsm8k_reasoning_base_2025-11-26T00-59-11.177986.jsonl +3 -0
- iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T23-58-29.900269.json +126 -0
- iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_hellaswag_2025-07-17T23-58-29.900269.jsonl +3 -0
- iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-26-52.816826.json +134 -0
- iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_2025-07-17T19-26-52.816826.jsonl +0 -0
- iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-22T03-51-05.521361.json +148 -0
- iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_64_2025-08-22T03-51-05.521361.jsonl +3 -0
- iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-25T01-35-23.860829.json +148 -0
- iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_humaneval_64_2025-11-25T01-35-23.860829.jsonl +3 -0
- iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-13-52.015185.json +140 -0
- iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_ifeval_2025-07-17T20-13-52.015185.jsonl +0 -0
- iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-26-10.269030.json +121 -0
- iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_leaderboard_gpqa_diamond_2025-07-17T20-26-10.269030.jsonl +0 -0
.gitattributes
CHANGED
@@ -36,3 +36,13 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 tokenizer.json filter=lfs diff=lfs merge=lfs -text
 250k/tokenizer.json filter=lfs diff=lfs merge=lfs -text
 figures/sft-models.png filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_2025-07-17T20-44-20.876997.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_gsm8k_reasoning_base_2025-11-26T00-59-11.177986.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_hellaswag_2025-07-17T23-58-29.900269.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_64_2025-08-22T03-51-05.521361.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_humaneval_64_2025-11-25T01-35-23.860829.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/minerva_math500_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_minerva_math500_2025-11-28T05-06-05.219253.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/mmlu_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_mmlu_professional_law_2025-07-18T00-49-44.450953.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/eval_results/mmlu_pro_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_mmlu_pro_law_2025-07-17T21-35-56.511842.jsonl filter=lfs diff=lfs merge=lfs -text
+iter_1249000/tokenizer.json filter=lfs diff=lfs merge=lfs -text
iter_1249000/config.json
ADDED
@@ -0,0 +1,30 @@
{
  "architectures": [
    "LlamaForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "bos_token_id": 0,
  "eos_token_id": 1,
  "head_dim": 128,
  "hidden_act": "silu",
  "hidden_size": 8192,
  "initializer_range": 0.02,
  "intermediate_size": 28672,
  "max_position_embeddings": 32768,
  "mlp_bias": false,
  "model_type": "llama",
  "num_attention_heads": 64,
  "num_hidden_layers": 80,
  "num_key_value_heads": 8,
  "num_local_experts": null,
  "pretraining_tp": 1,
  "rms_norm_eps": 1e-05,
  "rope_scaling": null,
  "rope_theta": 500000,
  "tie_word_embeddings": false,
  "torch_dtype": "float32",
  "transformers_version": "4.53.0.dev0",
  "use_cache": true,
  "vocab_size": 250112
}
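The config above describes an 80-layer Llama-architecture decoder: hidden size 8192, 64 attention heads with 8 key-value heads (grouped-query attention), rope_theta 500000, a 32768-token context window, and a vocabulary trimmed to 250112 entries. A minimal sketch of inspecting it with Hugging Face transformers follows; it assumes the model weights sit next to this config.json inside the iter_1249000 folder, which this diff does not show.

# Hedged sketch, not part of the upload: load the iter_1249000 config and sanity-check
# a couple of derived quantities against the values recorded above.
from transformers import AutoConfig

cfg = AutoConfig.from_pretrained("./iter_1249000")  # local checkout of this repo (assumption)

# 80-layer Llama-style decoder with grouped-query attention (64 query heads, 8 KV heads)
print(cfg.model_type, cfg.num_hidden_layers, cfg.num_attention_heads, cfg.num_key_value_heads)

# head_dim in the config should equal hidden_size / num_attention_heads: 8192 / 64 = 128
assert cfg.hidden_size // cfg.num_attention_heads == cfg.head_dim

# vocab_size reflects the trimmed tokenizer (250112 entries), matching "vocab_trimmed" in the path
print(cfg.vocab_size, cfg.max_position_embeddings)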
iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-57-20.454173.json
ADDED
@@ -0,0 +1,125 @@
{
  "results": {
    "arc_challenge": {
      "alias": "arc_challenge",
      "acc,none": 0.6083617747440273,
      "acc_stderr,none": 0.014264122124938267,
      "acc_norm,none": 0.6484641638225256,
      "acc_norm_stderr,none": 0.013952413699601044
    }
  },
  "group_subtasks": {
    "arc_challenge": []
  },
  "configs": {
    "arc_challenge": {
      "task": "arc_challenge",
      "tag": [
        "ai2_arc"
      ],
      "dataset_path": "allenai/ai2_arc",
      "dataset_name": "ARC-Challenge",
      "training_split": "train",
      "validation_split": "validation",
      "test_split": "test",
      "doc_to_text": "Question: {{question}}\nAnswer:",
      "doc_to_target": "{{choices.label.index(answerKey)}}",
      "unsafe_code": false,
      "doc_to_choice": "{{choices.text}}",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 25,
      "metric_list": [
        {
          "metric": "acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": true,
      "doc_to_decontamination_query": "Question: {{question}}\nAnswer:",
      "metadata": {
        "version": 1.0,
        "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
        "tensor_parallel_size": 8,
        "dtype": "float32",
        "gpu_memory_utilization": 0.8
      }
    }
  },
  "versions": {
    "arc_challenge": 1.0
  },
  "n-shot": {
    "arc_challenge": 25
  },
  "higher_is_better": {
    "arc_challenge": {
      "acc": true,
      "acc_norm": true
    }
  },
  "n-samples": {
    "arc_challenge": {
      "original": 1172,
      "effective": 1172
    }
  },
  "config": {
    "model": "vllm",
    "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
    "batch_size": "1",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null,
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "a445a07",
  "date": 1752779715.342989,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
  "transformers_version": "4.53.0.dev0",
  "lm_eval_version": "0.4.8",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_eos_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_bos_token": [
    "<|begin_of_text|>",
    "0"
  ],
  "eot_token_id": 1,
  "max_length": 32768,
  "task_hashes": {
    "arc_challenge": "55e883475b5650b20d8d9dc1e9cdf59ef645a257fcd74bf43a9dbb5c632c529c"
  },
  "model_source": "vllm",
  "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
  "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 1470957.02656259,
  "end_time": 1473500.429686519,
  "total_evaluation_time_seconds": "2543.4031239291653"
}
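The "config" block of this results file records exactly how the run was launched: the vLLM backend over 8-way tensor parallelism in float32, batch size 1, 25-shot, with lm_eval 0.4.8. A hedged sketch of reproducing it through the harness's Python API follows; the keyword names are an assumption against your installed lm-evaluation-harness version, while the values are copied from the file above.

# Hedged sketch: re-run the 25-shot ARC-Challenge evaluation recorded above.
import lm_eval

results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,"
        "tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8"
    ),
    tasks=["arc_challenge"],
    num_fewshot=25,
    batch_size=1,  # recorded as "1" in the config block above
)

# "acc,none" and "acc_norm,none" here correspond to the 0.6084 / 0.6485 figures stored above.
print(results["results"]["arc_challenge"])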
iter_1249000/eval_results/arc_challenge_25shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ba6a3cf4ac05ff39bae31ead5ab97e287408840565822eebb65b974ed78fb874
size 23433988
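The per-sample logs are stored as Git LFS pointers (the version/oid/size triplet above), so only the pointer appears in this diff. A minimal sketch for fetching and reading one of them with huggingface_hub follows; "org/model-repo" is a placeholder, since the repository id is not named anywhere in this diff.

# Hedged sketch: resolve an LFS-tracked samples file and read its first JSONL record.
import json
from huggingface_hub import hf_hub_download

path = hf_hub_download(
    repo_id="org/model-repo",  # placeholder, not taken from this diff
    filename="iter_1249000/eval_results/arc_challenge_25shots/"
             "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/"
             "samples_arc_challenge_2025-07-17T19-57-20.454173.jsonl",
)

with open(path) as f:
    first = json.loads(f.readline())  # one logged document per line
print(sorted(first.keys()))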
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-18-56.328280.json
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_boolean_expressions_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_causal_judgement_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_date_understanding_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_disambiguation_qa_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_dyck_languages_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_formal_fallacies_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_geometric_shapes_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_hyperbaton_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_five_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_seven_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_logical_deduction_three_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_movie_recommendation_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_multistep_arithmetic_two_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_navigate_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_object_counting_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_penguins_in_a_table_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_reasoning_about_colored_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_ruin_names_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_salient_translation_error_detection_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_snarks_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_sports_understanding_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_temporal_sequences_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_five_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_seven_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_tracking_shuffled_objects_three_objects_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_web_of_lies_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/bbh_3shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_bbh_cot_fewshot_word_sorting_2025-07-17T20-18-56.328280.jsonl
ADDED
The diff for this file is too large to render. See raw diff
iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-44-20.876997.json
ADDED
@@ -0,0 +1,161 @@
{
  "results": {
    "gsm8k": {
      "alias": "gsm8k",
      "exact_match,strict-match": 0.6785443517816527,
      "exact_match_stderr,strict-match": 0.012864471384836705,
      "exact_match,flexible-extract": 0.6800606520090978,
      "exact_match_stderr,flexible-extract": 0.012848426555240763
    }
  },
  "group_subtasks": {
    "gsm8k": []
  },
  "configs": {
    "gsm8k": {
      "task": "gsm8k",
      "tag": [
        "math_word_problems"
      ],
      "dataset_path": "gsm8k",
      "dataset_name": "main",
      "training_split": "train",
      "test_split": "test",
      "fewshot_split": "train",
      "doc_to_text": "Question: {{question}}\nAnswer:",
      "doc_to_target": "{{answer}}",
      "unsafe_code": false,
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 5,
      "metric_list": [
        {
          "metric": "exact_match",
          "aggregation": "mean",
          "higher_is_better": true,
          "ignore_case": true,
          "ignore_punctuation": false,
          "regexes_to_ignore": [
            ",",
            "\\$",
            "(?s).*#### ",
            "\\.$"
          ]
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [
          "Question:",
          "</s>",
          "<|im_end|>"
        ],
        "do_sample": false,
        "temperature": 0.0
      },
      "repeats": 1,
      "filter_list": [
        {
          "name": "strict-match",
          "filter": [
            {
              "function": "regex",
              "regex_pattern": "#### (\\-?[0-9\\.\\,]+)"
            },
            {
              "function": "take_first"
            }
          ]
        },
        {
          "name": "flexible-extract",
          "filter": [
            {
              "function": "regex",
              "group_select": -1,
              "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
            },
            {
              "function": "take_first"
            }
          ]
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 3.0,
        "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
        "tensor_parallel_size": 8,
        "dtype": "float32",
        "gpu_memory_utilization": 0.8
      }
    }
  },
  "versions": {
    "gsm8k": 3.0
  },
  "n-shot": {
    "gsm8k": 5
  },
  "higher_is_better": {
    "gsm8k": {
      "exact_match": true
    }
  },
  "n-samples": {
    "gsm8k": {
      "original": 1319,
      "effective": 1319
    }
  },
  "config": {
    "model": "vllm",
    "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
    "batch_size": "1",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null,
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "a445a07",
  "date": 1752782267.9471405,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
  "transformers_version": "4.53.0.dev0",
  "lm_eval_version": "0.4.8",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_eos_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_bos_token": [
    "<|begin_of_text|>",
    "0"
  ],
  "eot_token_id": 1,
  "max_length": 32768,
  "task_hashes": {
    "gsm8k": "2330f4ebfcccaf66a892922df2819cdb1f118e448d076d3f42bdde4177678ac7"
  },
  "model_source": "vllm",
  "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
  "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 1473522.659870797,
  "end_time": 1476320.847624145,
  "total_evaluation_time_seconds": "2798.187753347913"
}
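The gsm8k task above scores generations with two filters: strict-match only accepts an answer written in GSM8K's "#### <number>" format, while flexible-extract takes the last number-like span anywhere in the completion (group_select -1, then take_first). A small sketch applying the recorded regexes follows; the sample completion is made up for illustration.

# Hedged sketch: the two answer-extraction filters from the gsm8k config above,
# applied to a hypothetical model completion.
import re

completion = "She sells 16 - 3 - 4 = 9 eggs a day, so she makes 9 * 2 = $18 every day.\n#### 18"

# strict-match: require the canonical "#### <number>" answer format
strict = re.search(r"#### (\-?[0-9\.\,]+)", completion)
print(strict.group(1) if strict else "[invalid]")  # -> 18

# flexible-extract: take the last number-like match (group_select = -1)
matches = re.findall(r"(-?[$0-9.,]{2,})|(-?[0-9]+)", completion)
last = next(g for g in matches[-1] if g) if matches else "[invalid]"
print(last)  # -> 18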
iter_1249000/eval_results/gsm8k_5shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_2025-07-17T20-44-20.876997.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:356b69e32a7269557dd4d9232d1880291c698fe6fa684df598cb151b91c1a312
size 12375232
iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-07T20-33-19.023288.json
ADDED
@@ -0,0 +1,195 @@
{
  "results": {
    "gsm8k_cot": {
      "alias": "gsm8k_cot",
      "exact_match,strict-match": 0.5852918877937832,
      "exact_match_stderr,strict-match": 0.01357062384230451,
      "exact_match,flexible-extract": 0.6322971948445792,
      "exact_match_stderr,flexible-extract": 0.01328163050339548
    }
  },
  "group_subtasks": {
    "gsm8k_cot": []
  },
  "configs": {
    "gsm8k_cot": {
      "task": "gsm8k_cot",
      "tag": [
        "chain_of_thought"
      ],
      "dataset_path": "gsm8k",
      "dataset_name": "main",
      "test_split": "test",
      "doc_to_text": "Q: {{question}}\nA:",
      "doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
      "unsafe_code": false,
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "first_n",
        "samples": [
          {
            "question": "There are 15 trees in the grove. Grove workers will plant trees in the grove today. After they are done, there will be 21 trees. How many trees did the grove workers plant today?",
            "target": "There are 15 trees originally. Then there were 21 trees after some more were planted. So there must have been 21 - 15 = 6. The answer is 6."
          },
          {
            "question": "If there are 3 cars in the parking lot and 2 more cars arrive, how many cars are in the parking lot?",
            "target": "There are originally 3 cars. 2 more cars arrive. 3 + 2 = 5. The answer is 5."
          },
          {
            "question": "Leah had 32 chocolates and her sister had 42. If they ate 35, how many pieces do they have left in total?",
            "target": "Originally, Leah had 32 chocolates. Her sister had 42. So in total they had 32 + 42 = 74. After eating 35, they had 74 - 35 = 39. The answer is 39."
          },
          {
            "question": "Jason had 20 lollipops. He gave Denny some lollipops. Now Jason has 12 lollipops. How many lollipops did Jason give to Denny?",
            "target": "Jason started with 20 lollipops. Then he had 12 after giving some to Denny. So he gave Denny 20 - 12 = 8. The answer is 8."
          },
          {
            "question": "Shawn has five toys. For Christmas, he got two toys each from his mom and dad. How many toys does he have now?",
            "target": "Shawn started with 5 toys. If he got 2 toys each from his mom and dad, then that is 4 more toys. 5 + 4 = 9. The answer is 9."
          },
          {
            "question": "There were nine computers in the server room. Five more computers were installed each day, from monday to thursday. How many computers are now in the server room?",
            "target": "There were originally 9 computers. For each of 4 days, 5 more computers were added. So 5 * 4 = 20 computers were added. 9 + 20 is 29. The answer is 29."
          },
          {
            "question": "Michael had 58 golf balls. On tuesday, he lost 23 golf balls. On wednesday, he lost 2 more. How many golf balls did he have at the end of wednesday?",
            "target": "Michael started with 58 golf balls. After losing 23 on tuesday, he had 58 - 23 = 35. After losing 2 more, he had 35 - 2 = 33 golf balls. The answer is 33."
          },
          {
            "question": "Olivia has $23. She bought five bagels for $3 each. How much money does she have left?",
            "target": "Olivia had 23 dollars. 5 bagels for 3 dollars each will be 5 x 3 = 15 dollars. So she has 23 - 15 dollars left. 23 - 15 is 8. The answer is 8."
          }
        ]
      },
      "num_fewshot": 8,
      "metric_list": [
        {
          "aggregation": "mean",
          "higher_is_better": true,
          "ignore_case": true,
          "ignore_punctuation": false,
          "metric": "exact_match",
          "regexes_to_ignore": [
            ",",
            "\\$",
            "(?s).*#### ",
            "\\.$"
          ]
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "do_sample": false,
        "until": [
          "Q:",
          "</s>",
          "<|im_end|>"
        ]
      },
      "repeats": 1,
      "filter_list": [
        {
          "filter": [
            {
              "function": "regex",
              "regex_pattern": "The answer is (\\-?[0-9\\.\\,]+)."
            },
            {
              "function": "take_first"
            }
          ],
          "name": "strict-match"
        },
        {
          "filter": [
            {
              "function": "regex",
              "group_select": -1,
              "regex_pattern": "(-?[$0-9.,]{2,})|(-?[0-9]+)"
            },
            {
              "function": "take_first"
            }
          ],
          "name": "flexible-extract"
        }
      ],
      "should_decontaminate": false,
      "metadata": {
        "version": 3.0,
        "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
        "tensor_parallel_size": 8,
        "dtype": "float32",
        "gpu_memory_utilization": 0.7
      }
    }
  },
  "versions": {
    "gsm8k_cot": 3.0
  },
  "n-shot": {
    "gsm8k_cot": 8
  },
  "higher_is_better": {
    "gsm8k_cot": {
      "exact_match": true
    }
  },
  "n-samples": {
    "gsm8k_cot": {
      "original": 1319,
      "effective": 1319
    }
  },
  "config": {
    "model": "vllm",
    "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.7",
    "batch_size": "1",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null,
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "18965e2",
  "date": 1754596227.0581243,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
  "transformers_version": "4.53.0.dev0",
  "lm_eval_version": "0.4.8",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_eos_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_bos_token": [
    "<|begin_of_text|>",
    "0"
  ],
  "eot_token_id": 1,
  "max_length": 32768,
  "task_hashes": {
    "gsm8k_cot": "fc360963b39ee52c26a82795124f9ad7da4d6a8fecf1b77e2502823b1669b3d0"
  },
  "model_source": "vllm",
  "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
  "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 3287464.513322936,
  "end_time": 3290055.302254567,
  "total_evaluation_time_seconds": "2590.7889316310175"
}
iter_1249000/eval_results/gsm8k_cot_8shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_gsm8k_cot_2025-08-07T20-33-19.023288.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-26T00-59-11.177986.json
ADDED
@@ -0,0 +1,130 @@
{
"results": {
"gsm8k_reasoning_base": {
"alias": "gsm8k_reasoning_base",
"math_verify,none": 0.07808946171341925,
"math_verify_stderr,none": 0.007390654481108267
}
},
"group_subtasks": {
"gsm8k_reasoning_base": []
},
"configs": {
"gsm8k_reasoning_base": {
"task": "gsm8k_reasoning_base",
"tag": [
"math_word_problems"
],
"dataset_path": "gsm8k",
"dataset_name": "main",
"training_split": "train",
"test_split": "test",
"fewshot_split": "train",
"doc_to_text": "The user asks a question, and the Assistant solves it. The assistant first thinks about the reasoning process in the mind and then provides the user with the final answer. The reasoning process and answer are enclosed within <think> </think> and <answer> </answer> tags, respectively, i.e., <think> reasoning process here </think><answer> answer here </answer>. In the answer mention each unknown and its solution, for example, <answer> boxed{10} </answer>. Now the user asks you to solve a math reasoning problem.\n\nUser:{{question}}\nAssistant: <think>",
"doc_to_target": "{{answer.split('####')[-1].strip() if answer is defined else target}}",
"unsafe_code": false,
"process_results": "def process_results(doc: dict, results: List[str]) -> Dict[str, int]:\n candidates = results[0]\n\n # Extract from \\\\boxed{} if present\n boxed_answer = last_boxed_only_string(candidates)\n if boxed_answer is not None:\n try:\n boxed_content = remove_boxed(boxed_answer)\n if boxed_content is not None:\n candidates = boxed_content\n except (AssertionError, IndexError):\n pass\n\n # math_verify\n # print(\"=\"*100)\n # print(candidates, doc[\"answer\"].split('####')[-1].strip())\n # print(\"=\"*100)\n\n # Extract only digits from candidates\n raw_candidates = candidates\n if \"</answer>\" in candidates:\n candidates = candidates.split(\"</answer>\")[-2]\n if \"<answer>\" in candidates:\n candidates = candidates.split(\"<answer>\")[-1]\n candidates = candidates.split('=')[-1]\n if \"**\"in candidates:\n candidates = candidates.split(\"**\")[-2]\n if '$' in candidates:\n candidates = candidates.split('$')[-1]\n candidates = ''.join(c for c in candidates if c.isdigit() or c in \".-\")\n\n res = verify(parse(doc[\"answer\"].split('####')[-1].strip()), parse(candidates))\n mathval = 1 if res else 0\n if mathval == 0:\n gt = parse(doc[\"answer\"].split(\"####\")[-1].strip())\n print('=' * 80)\n print(f\"{parse(candidates)=}\")\n print(f\"{gt=}\")\n print(f\"{candidates=}\")\n print(f\"{repr(raw_candidates)=}\")\n\n results = {\n \"math_verify\": mathval,\n }\n return results\n",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "math_verify",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [
"Question:",
"</s>",
"<|im_end|>"
],
"do_sample": true,
"temperature": 1.0,
"max_gen_toks": 32768
},
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 3.0,
"pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
"tensor_parallel_size": 8,
"dtype": "float32",
"gpu_memory_utilization": 0.9
}
}
},
"versions": {
"gsm8k_reasoning_base": 3.0
},
"n-shot": {
"gsm8k_reasoning_base": 0
},
"higher_is_better": {
"gsm8k_reasoning_base": {
"math_verify": true
}
},
"n-samples": {
"gsm8k_reasoning_base": {
"original": 1319,
"effective": 1319
}
},
"config": {
"model": "vllm",
"model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.9",
"batch_size": "auto",
"batch_sizes": [],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": {
"do_sample": true,
"temperature": 1.0,
"max_gen_toks": 32768
},
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": "e9f1740",
"date": 1764113938.7952223,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
"transformers_version": "4.53.3",
"lm_eval_version": "0.4.9.1",
"upper_git_hash": null,
"tokenizer_pad_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_eos_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_bos_token": [
"<|begin_of_text|>",
"0"
],
"eot_token_id": 1,
"max_length": 32768,
"task_hashes": {
"gsm8k_reasoning_base": "727a4d754e27098dfdff5cd9b4a7bb1cfe61b92efda150fe0051c36b1c0a04ad"
},
"model_source": "vllm",
"model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
"model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__",
"system_instruction": null,
"system_instruction_sha": null,
"fewshot_as_multiturn": false,
"chat_template": null,
"chat_template_sha": null,
"start_time": 60649.515730168,
"end_time": 65471.732839258,
"total_evaluation_time_seconds": "4822.217109090001"
}
iter_1249000/eval_results/gsm8k_reasoning_base_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_gsm8k_reasoning_base_2025-11-26T00-59-11.177986.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:01727c3fade7f869b90320445fe1f222a4a8408b274ee31b4aee8f3c23d8b3a5
size 37855817
iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T23-58-29.900269.json
ADDED
@@ -0,0 +1,126 @@
{
"results": {
"hellaswag": {
"alias": "hellaswag",
"acc,none": 0.6917944632543318,
"acc_stderr,none": 0.0046080828155357035,
"acc_norm,none": 0.8778131846245768,
"acc_norm_stderr,none": 0.003268321260913458
}
},
"group_subtasks": {
"hellaswag": []
},
"configs": {
"hellaswag": {
"task": "hellaswag",
"tag": [
"multiple_choice"
],
"dataset_path": "hellaswag",
"dataset_kwargs": {
"trust_remote_code": true
},
"training_split": "train",
"validation_split": "validation",
"process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n ctx = doc[\"ctx_a\"] + \" \" + doc[\"ctx_b\"].capitalize()\n out_doc = {\n \"query\": preprocess(doc[\"activity_label\"] + \": \" + ctx),\n \"choices\": [preprocess(ending) for ending in doc[\"endings\"]],\n \"gold\": int(doc[\"label\"]),\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
"doc_to_text": "{{query}}",
"doc_to_target": "{{label}}",
"unsafe_code": false,
"doc_to_choice": "choices",
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 10,
"metric_list": [
{
"metric": "acc",
"aggregation": "mean",
"higher_is_better": true
},
{
"metric": "acc_norm",
"aggregation": "mean",
"higher_is_better": true
}
],
"output_type": "multiple_choice",
"repeats": 1,
"should_decontaminate": false,
"metadata": {
"version": 1.0,
"pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
"tensor_parallel_size": 8,
"dtype": "float32",
"gpu_memory_utilization": 0.8
}
}
},
"versions": {
"hellaswag": 1.0
},
"n-shot": {
"hellaswag": 10
},
"higher_is_better": {
"hellaswag": {
"acc": true,
"acc_norm": true
}
},
"n-samples": {
"hellaswag": {
"original": 10042,
"effective": 10042
}
},
"config": {
"model": "vllm",
"model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
"batch_size": "1",
"batch_sizes": [],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null,
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": "a445a07",
"date": 1752779716.3588188,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
"transformers_version": "4.53.0.dev0",
"lm_eval_version": "0.4.8",
"upper_git_hash": null,
"tokenizer_pad_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_eos_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_bos_token": [
"<|begin_of_text|>",
"0"
],
"eot_token_id": 1,
"max_length": 32768,
"task_hashes": {
"hellaswag": "d4bcb44ec68db2b8a65f050c3c64c48454179b48fd8aee3e73b55e2ec51e6d82"
},
"model_source": "vllm",
"model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
"model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
"system_instruction": null,
"system_instruction_sha": null,
"fewshot_as_multiturn": false,
"chat_template": null,
"chat_template_sha": null,
"start_time": 1470954.839314792,
"end_time": 1487967.681337241,
"total_evaluation_time_seconds": "17012.84202244901"
}
iter_1249000/eval_results/hellaswag_10shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_hellaswag_2025-07-17T23-58-29.900269.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c9d41f492759ccbaa512715014c7f9fedbf37f04ce4d579ab84191b90ea7a812
size 187317968
iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T19-26-52.816826.json
ADDED
@@ -0,0 +1,134 @@
{
"results": {
"humaneval": {
"alias": "humaneval",
"pass@1,create_test": 0.5,
"pass@1_stderr,create_test": 0.03916302249939787
}
},
"group_subtasks": {
"humaneval": []
},
"configs": {
"humaneval": {
"task": "humaneval",
"dataset_path": "openai/openai_humaneval",
"test_split": "test",
"doc_to_text": "{{prompt}}",
"doc_to_target": "{{test}}\ncheck({{entry_point}})",
"unsafe_code": true,
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
"aggregation": "mean",
"higher_is_better": true,
"k": [
1
]
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [
"\nclass",
"\ndef",
"\n#",
"\nif",
"\nprint"
],
"max_gen_toks": 1024,
"do_sample": false
},
"repeats": 1,
"filter_list": [
{
"name": "create_test",
"filter": [
{
"function": "custom",
"filter_fn": "<function build_predictions at 0x150e2169b6a0>"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 1.0,
"pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
"tensor_parallel_size": 8,
"dtype": "float32",
"gpu_memory_utilization": 0.8
}
}
},
"versions": {
"humaneval": 1.0
},
"n-shot": {
"humaneval": 0
},
"higher_is_better": {
"humaneval": {
"pass_at_k": true
}
},
"n-samples": {
"humaneval": {
"original": 164,
"effective": 164
}
},
"config": {
"model": "vllm",
"model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
"batch_size": "1",
"batch_sizes": [],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null,
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": "a445a07",
"date": 1752779715.3442433,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
"transformers_version": "4.53.0.dev0",
"lm_eval_version": "0.4.8",
"upper_git_hash": null,
"tokenizer_pad_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_eos_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_bos_token": [
"<|begin_of_text|>",
"0"
],
"eot_token_id": 1,
"max_length": 32768,
"task_hashes": {
"humaneval": "c122632f0bdffdd0162cc8033a879dfe9d3b35e82c52eda6a767069e0a2beb40"
},
"model_source": "vllm",
"model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
"model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
"system_instruction": null,
"system_instruction_sha": null,
"fewshot_as_multiturn": false,
"chat_template": null,
"chat_template_sha": null,
"start_time": 1470959.94938541,
"end_time": 1471675.70521845,
"total_evaluation_time_seconds": "715.7558330399916"
}
iter_1249000/eval_results/humaneval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_2025-07-17T19-26-52.816826.jsonl
ADDED
The diff for this file is too large to render.
See raw diff
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-08-22T03-51-05.521361.json
ADDED
@@ -0,0 +1,148 @@
{
"results": {
"humaneval_64": {
"alias": "humaneval_64",
"pass@2,create_test": 0.5489377661633754,
"pass@2_stderr,create_test": 0.0340805230997571,
"pass@8,create_test": 0.6579873034944809,
"pass@8_stderr,create_test": 0.0337464509487294,
"pass@16,create_test": 0.7013816146155691,
"pass@16_stderr,create_test": 0.03303964263380142,
"pass@32,create_test": 0.7398964514096209,
"pass@32_stderr,create_test": 0.03268103770634371,
"pass@64,create_test": 0.7682926829268293,
"pass@64_stderr,create_test": 0.033047561588107836
}
},
"group_subtasks": {
"humaneval_64": []
},
"configs": {
"humaneval_64": {
"task": "humaneval_64",
"dataset_path": "openai/openai_humaneval",
"test_split": "test",
"doc_to_text": "{{prompt}}",
"doc_to_target": "{{test}}\ncheck({{entry_point}})",
"unsafe_code": true,
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
"aggregation": "mean",
"higher_is_better": true,
"k": [
2,
8,
16,
32,
64
]
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [
"\nclass",
"\ndef",
"\n#",
"\nif",
"\nprint"
],
"max_gen_toks": 1024,
"do_sample": true,
"temperature": 0.2,
"top_p": 0.95
},
"repeats": 64,
"filter_list": [
{
"name": "create_test",
"filter": [
{
"function": "custom",
"filter_fn": "<function build_predictions at 0x14f666525080>"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 1.0,
"pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
"tensor_parallel_size": 8,
"dtype": "float32",
"gpu_memory_utilization": 0.9
}
}
},
"versions": {
"humaneval_64": 1.0
},
"n-shot": {
"humaneval_64": 0
},
"higher_is_better": {
"humaneval_64": {
"pass_at_k": true
}
},
"n-samples": {
"humaneval_64": {
"original": 164,
"effective": 164
}
},
"config": {
"model": "vllm",
"model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.9",
"batch_size": "auto",
"batch_sizes": [],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null,
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": "bc23ea7",
"date": 1755832330.597868,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] numpy 1.26.4 pypi_0 pypi\n[conda] nvidia-cublas-cu12 12.1.3.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.1.105 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.1.105 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.1.105 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.1.0.70 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.0.2.54 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.2.106 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.4.5.107 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.1.0.106 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.20.5 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.9.86 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.1.105 pypi_0 pypi\n[conda] torch 2.4.1 pypi_0 pypi\n[conda] triton 3.0.0 pypi_0 pypi",
"transformers_version": "4.53.0.dev0",
"lm_eval_version": "0.4.9.1",
"upper_git_hash": null,
"tokenizer_pad_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_eos_token": [
"<|end_of_text|>",
"1"
],
"tokenizer_bos_token": [
"<|begin_of_text|>",
"0"
],
"eot_token_id": 1,
"max_length": 32768,
"task_hashes": {
"humaneval_64": "c122632f0bdffdd0162cc8033a879dfe9d3b35e82c52eda6a767069e0a2beb40"
},
"model_source": "vllm",
"model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
"model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
"system_instruction": null,
"system_instruction_sha": null,
"fewshot_as_multiturn": false,
"chat_template": null,
"chat_template_sha": null,
"start_time": 69255.504785092,
"end_time": 71603.678026169,
"total_evaluation_time_seconds": "2348.1732410770055"
}
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_humaneval_64_2025-08-22T03-51-05.521361.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e0648dffef479001422c9be1420555df168a9063f0c2910915904f2ada4e8cdd
size 13131722
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/results_2025-11-25T01-35-23.860829.json
ADDED
@@ -0,0 +1,148 @@
{
"results": {
"humaneval_64": {
"alias": "humaneval_64",
"pass@2,create_test": 0.5490285036778934,
"pass@2_stderr,create_test": 0.0340842438280205,
"pass@8,create_test": 0.6579901080473166,
"pass@8_stderr,create_test": 0.03374662453348631,
"pass@16,create_test": 0.7013816165516722,
"pass@16_stderr,create_test": 0.03303964274115624,
"pass@32,create_test": 0.7398964514096209,
"pass@32_stderr,create_test": 0.03268103770634371,
"pass@64,create_test": 0.7682926829268293,
"pass@64_stderr,create_test": 0.033047561588107836
}
},
"group_subtasks": {
"humaneval_64": []
},
"configs": {
"humaneval_64": {
"task": "humaneval_64",
"dataset_path": "openai/openai_humaneval",
"test_split": "test",
"doc_to_text": "{{prompt}}",
"doc_to_target": "{{test}}\ncheck({{entry_point}})",
"unsafe_code": true,
"description": "",
"target_delimiter": " ",
"fewshot_delimiter": "\n\n",
"num_fewshot": 0,
"metric_list": [
{
"metric": "def pass_at_k(references: list[str], predictions: list[list[str]], k: list[int] = None):\n global compute_\n assert k is not None\n if isinstance(k, int):\n k = [k]\n res = compute_.compute(\n references=references,\n predictions=predictions,\n k=k,\n )\n return res[0]\n",
"aggregation": "mean",
"higher_is_better": true,
"k": [
2,
8,
16,
32,
64
]
}
],
"output_type": "generate_until",
"generation_kwargs": {
"until": [
"\nclass",
"\ndef",
"\n#",
"\nif",
"\nprint"
],
"max_gen_toks": 1024,
"do_sample": true,
"temperature": 0.2,
"top_p": 0.95
},
"repeats": 64,
"filter_list": [
{
"name": "create_test",
"filter": [
{
"function": "custom",
"filter_fn": "<function build_predictions at 0x1518b1ea5d00>"
}
]
}
],
"should_decontaminate": false,
"metadata": {
"version": 1.0,
"pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
"tensor_parallel_size": 8,
"dtype": "float32",
"gpu_memory_utilization": 0.9
}
}
},
"versions": {
"humaneval_64": 1.0
},
"n-shot": {
"humaneval_64": 0
},
"higher_is_better": {
"humaneval_64": {
"pass_at_k": true
}
},
"n-samples": {
"humaneval_64": {
"original": 164,
"effective": 164
}
},
"config": {
"model": "vllm",
"model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.9",
"batch_size": "auto",
"batch_sizes": [],
"device": null,
"use_cache": null,
"limit": null,
"bootstrap_iters": 100000,
"gen_kwargs": null,
"random_seed": 0,
"numpy_seed": 1234,
"torch_seed": 1234,
"fewshot_seed": 1234
},
"git_hash": "e9f1740",
"date": 1764032186.8127863,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.2 | packaged by conda-forge | (main, Feb 16 2024, 20:50:58) [GCC 12.3.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
  "transformers_version": "4.53.3",
  "lm_eval_version": "0.4.9.1",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_eos_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_bos_token": [
    "<|begin_of_text|>",
    "0"
  ],
  "eot_token_id": 1,
  "max_length": 32768,
  "task_hashes": {
    "humaneval_64": "c122632f0bdffdd0162cc8033a879dfe9d3b35e82c52eda6a767069e0a2beb40"
  },
  "model_source": "vllm",
  "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000/",
  "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 2356619.571626194,
  "end_time": 2358988.00763084,
  "total_evaluation_time_seconds": "2368.4360046461225"
}
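The tail of each results_*.json in this upload, as reconstructed above, carries the run metadata: harness and transformers versions, tokenizer special tokens, the task hash, and wall-clock timing. A minimal sketch of reading those fields back with the standard library; the local checkout path is an assumption, only the directory layout mirrors this commit.

```python
import json
from pathlib import Path

# Assumed local checkout of this dataset repo (hypothetical path; adjust as needed).
task_dir = Path("iter_1249000/eval_results/humaneval_64_0shots")
results_file = next(task_dir.rglob("results_*.json"))  # first results file under the task dir

with results_file.open() as f:
    run = json.load(f)

# Fields recorded in the JSON above.
print(run["lm_eval_version"], run["transformers_version"])  # 0.4.9.1  4.53.3
print(run["task_hashes"])                                   # {'humaneval_64': 'c122632f...'}
print(run["total_evaluation_time_seconds"])                 # stored as a string
```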
iter_1249000/eval_results/humaneval_64_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000__/samples_humaneval_64_2025-11-25T01-35-23.860829.jsonl
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:33c4be2197c48883fb25516739ea127716b0d9ba8d2c370c17d3e6a94b27edfc
size 13131722
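The samples_humaneval_64 file above is committed as a Git LFS pointer rather than raw JSONL: the three lines record the pointer spec version, the sha256 of the real payload, and its size in bytes (about 13 MB here). A small standard-library sketch of parsing such a pointer; the helper name is illustrative, not part of any tool in this repo.

```python
def parse_lfs_pointer(text: str) -> dict:
    """Parse a Git LFS pointer file (one 'key value' pair per line)."""
    fields = dict(line.split(" ", 1) for line in text.strip().splitlines())
    return {
        "version": fields["version"],
        "sha256": fields["oid"].removeprefix("sha256:"),
        "size_bytes": int(fields["size"]),
    }

pointer = """version https://git-lfs.github.com/spec/v1
oid sha256:33c4be2197c48883fb25516739ea127716b0d9ba8d2c370c17d3e6a94b27edfc
size 13131722"""
print(parse_lfs_pointer(pointer))  # size_bytes == 13131722
```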
iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-13-52.015185.json
ADDED
@@ -0,0 +1,140 @@
{
  "results": {
    "ifeval": {
      "alias": "ifeval",
      "prompt_level_strict_acc,none": 0.1256931608133087,
      "prompt_level_strict_acc_stderr,none": 0.014265627567173898,
      "inst_level_strict_acc,none": 0.22302158273381295,
      "inst_level_strict_acc_stderr,none": "N/A",
      "prompt_level_loose_acc,none": 0.133086876155268,
      "prompt_level_loose_acc_stderr,none": 0.014617009342904457,
      "inst_level_loose_acc,none": 0.23381294964028776,
      "inst_level_loose_acc_stderr,none": "N/A"
    }
  },
  "group_subtasks": {
    "ifeval": []
  },
  "configs": {
    "ifeval": {
      "task": "ifeval",
      "dataset_path": "google/IFEval",
      "test_split": "train",
      "doc_to_text": "prompt",
      "doc_to_target": 0,
      "unsafe_code": false,
      "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n",
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "prompt_level_strict_acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "inst_level_strict_acc",
          "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
          "higher_is_better": true
        },
        {
          "metric": "prompt_level_loose_acc",
          "aggregation": "mean",
          "higher_is_better": true
        },
        {
          "metric": "inst_level_loose_acc",
          "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n",
          "higher_is_better": true
        }
      ],
      "output_type": "generate_until",
      "generation_kwargs": {
        "until": [],
        "do_sample": false,
        "temperature": 0.0,
        "max_gen_toks": 1280
      },
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 4.0,
        "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
        "tensor_parallel_size": 8,
        "dtype": "float32",
        "gpu_memory_utilization": 0.8
      }
    }
  },
  "versions": {
    "ifeval": 4.0
  },
  "n-shot": {
    "ifeval": 0
  },
  "higher_is_better": {
    "ifeval": {
      "prompt_level_strict_acc": true,
      "inst_level_strict_acc": true,
      "prompt_level_loose_acc": true,
      "inst_level_loose_acc": true
    }
  },
  "n-samples": {
    "ifeval": {
      "original": 541,
      "effective": 541
    }
  },
  "config": {
    "model": "vllm",
    "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8",
    "batch_size": "auto",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null,
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "a445a07",
  "date": 1752782646.8382735,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 4000.00\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
  "transformers_version": "4.53.0.dev0",
  "lm_eval_version": "0.4.8",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_eos_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_bos_token": [
    "<|begin_of_text|>",
    "0"
  ],
  "eot_token_id": 1,
  "max_length": 32768,
  "task_hashes": {
    "ifeval": "a9cc24d7d92904c9f59225bb28b88b892d9ab82be222808ea7fa345ffd4500ae"
  },
  "model_source": "vllm",
  "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
  "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 1473896.634510482,
  "end_time": 1474487.194394837,
  "total_evaluation_time_seconds": "590.5598843549378"
}
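The "config" block above records what is needed to repeat this IFEval pass: the vLLM backend, the trimmed-vocab checkpoint path, tensor parallelism over the 8 H200s, float32, batch size "auto", and the default harness seeds. A minimal sketch of an equivalent launch through lm-evaluation-harness's Python API; this is an assumption about how such a run could be driven, since the upload does not include a launch script.

```python
import lm_eval

# Mirrors the recorded model_args for the 0-shot ifeval run above.
results = lm_eval.simple_evaluate(
    model="vllm",
    model_args=(
        "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,"
        "tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.8"
    ),
    tasks=["ifeval"],
    num_fewshot=0,
    batch_size="auto",
)
print(results["results"]["ifeval"]["prompt_level_strict_acc,none"])
```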
iter_1249000/eval_results/ifeval_0shots/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_ifeval_2025-07-17T20-13-52.015185.jsonl
ADDED
The diff for this file is too large to render. See raw diff.
iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/results_2025-07-17T20-26-10.269030.json
ADDED
@@ -0,0 +1,121 @@
{
  "results": {
    "leaderboard_gpqa_diamond": {
      "alias": "leaderboard_gpqa_diamond",
      "acc_norm,none": 0.26262626262626265,
      "acc_norm_stderr,none": 0.031353050095330834
    }
  },
  "group_subtasks": {
    "leaderboard_gpqa_diamond": []
  },
  "configs": {
    "leaderboard_gpqa_diamond": {
      "task": "leaderboard_gpqa_diamond",
      "dataset_path": "Idavidrein/gpqa",
      "dataset_name": "gpqa_diamond",
      "training_split": "train",
      "validation_split": "train",
      "process_docs": "def process_docs(dataset: datasets.Dataset) -> datasets.Dataset:\n def _process_doc(doc):\n choices = [\n preprocess(doc[\"Incorrect Answer 1\"]),\n preprocess(doc[\"Incorrect Answer 2\"]),\n preprocess(doc[\"Incorrect Answer 3\"]),\n preprocess(doc[\"Correct Answer\"]),\n ]\n\n random.shuffle(choices)\n correct_answer_index = choices.index(preprocess(doc[\"Correct Answer\"]))\n\n out_doc = {\n \"choice1\": choices[0],\n \"choice2\": choices[1],\n \"choice3\": choices[2],\n \"choice4\": choices[3],\n \"answer\": f\"({chr(65 + correct_answer_index)})\",\n }\n return out_doc\n\n return dataset.map(_process_doc)\n",
      "doc_to_text": "What is the correct answer to this question:{{Question}}\nChoices:\n(A) {{choice1}}\n(B) {{choice2}}\n(C) {{choice3}}\n(D) {{choice4}}\nAnswer: ",
      "doc_to_target": "answer",
      "unsafe_code": false,
      "doc_to_choice": [
        "(A)",
        "(B)",
        "(C)",
        "(D)"
      ],
      "description": "",
      "target_delimiter": " ",
      "fewshot_delimiter": "\n\n",
      "fewshot_config": {
        "sampler": "first_n"
      },
      "num_fewshot": 0,
      "metric_list": [
        {
          "metric": "acc_norm",
          "aggregation": "mean",
          "higher_is_better": true
        }
      ],
      "output_type": "multiple_choice",
      "repeats": 1,
      "should_decontaminate": false,
      "metadata": {
        "version": 1.0,
        "pretrained": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
        "tensor_parallel_size": 8,
        "dtype": "float32",
        "gpu_memory_utilization": 0.7
      }
    }
  },
  "versions": {
    "leaderboard_gpqa_diamond": 1.0
  },
  "n-shot": {
    "leaderboard_gpqa_diamond": 0
  },
  "higher_is_better": {
    "leaderboard_gpqa_diamond": {
      "acc_norm": true
    }
  },
  "n-samples": {
    "leaderboard_gpqa_diamond": {
      "original": 198,
      "effective": 198
    }
  },
  "config": {
    "model": "vllm",
    "model_args": "pretrained=/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000,tensor_parallel_size=8,dtype=float32,gpu_memory_utilization=0.7",
    "batch_size": "1",
    "batch_sizes": [],
    "device": null,
    "use_cache": null,
    "limit": null,
    "bootstrap_iters": 100000,
    "gen_kwargs": null,
    "random_seed": 0,
    "numpy_seed": 1234,
    "torch_seed": 1234,
    "fewshot_seed": 1234
  },
  "git_hash": "a445a07",
  "date": 1752783576.6566072,
"pretty_env_info": "PyTorch version: 2.7.0+cu126\nIs debug build: False\nCUDA used to build PyTorch: 12.6\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.31.7\nLibc version: glibc-2.35\n\nPython version: 3.12.4 | packaged by Anaconda, Inc. | (main, Jun 18 2024, 15:12:24) [GCC 11.2.0] (64-bit runtime)\nPython platform: Linux-5.15.0-1088-azure-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA H200\nGPU 1: NVIDIA H200\nGPU 2: NVIDIA H200\nGPU 3: NVIDIA H200\nGPU 4: NVIDIA H200\nGPU 5: NVIDIA H200\nGPU 6: NVIDIA H200\nGPU 7: NVIDIA H200\n\nNvidia driver version: 570.133.20\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 46 bits physical, 57 bits virtual\nByte Order: Little Endian\nCPU(s): 96\nOn-line CPU(s) list: 0-95\nVendor ID: GenuineIntel\nModel name: Intel(R) Xeon(R) Platinum 8480C\nCPU family: 6\nModel: 143\nThread(s) per core: 1\nCore(s) per socket: 48\nSocket(s): 2\nStepping: 8\nBogoMIPS: 3999.99\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology tsc_reliable nonstop_tsc cpuid aperfmperf tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt tsc_deadline_timer aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm avx512f avx512dq rdseed adx smap avx512ifma clflushopt clwb avx512cd sha_ni avx512bw avx512vl xsaveopt xsavec xgetbv1 xsaves avx_vnni avx512_bf16 avx512vbmi umip waitpkg avx512_vbmi2 gfni vaes vpclmulqdq avx512_vnni avx512_bitalg avx512_vpopcntdq la57 rdpid cldemote movdiri movdir64b fsrm serialize amx_bf16 avx512_fp16 amx_tile amx_int8 arch_capabilities\nHypervisor vendor: Microsoft\nVirtualization type: full\nL1d cache: 4.5 MiB (96 instances)\nL1i cache: 3 MiB (96 instances)\nL2 cache: 192 MiB (96 instances)\nL3 cache: 210 MiB (2 instances)\nNUMA node(s): 2\nNUMA node0 CPU(s): 0-47\nNUMA node1 CPU(s): 48-95\nVulnerability Gather data sampling: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Unknown: No mitigations\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Vulnerable\nVulnerability Spec rstack overflow: Not affected\nVulnerability Spec store bypass: Vulnerable\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; STIBP disabled; RSB filling; PBRSB-eIBRS Not affected; BHI Retpoline\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] flake8==7.0.0\n[pip3] flashinfer-python==0.2.5+cu126torch2.6\n[pip3] mypy==1.10.0\n[pip3] mypy-extensions==1.0.0\n[pip3] numpy==1.26.4\n[pip3] numpydoc==1.7.0\n[pip3] nvidia-cublas-cu12==12.6.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.6.80\n[pip3] nvidia-cuda-nvrtc-cu12==12.6.77\n[pip3] nvidia-cuda-runtime-cu12==12.6.77\n[pip3] nvidia-cudnn-cu12==9.5.1.17\n[pip3] nvidia-cufft-cu12==11.3.0.4\n[pip3] 
nvidia-curand-cu12==10.3.7.77\n[pip3] nvidia-cusolver-cu12==11.7.1.2\n[pip3] nvidia-cusparse-cu12==12.5.4.2\n[pip3] nvidia-cusparselt-cu12==0.6.3\n[pip3] nvidia-nccl-cu12==2.26.2\n[pip3] nvidia-nvjitlink-cu12==12.6.85\n[pip3] nvidia-nvtx-cu12==12.6.77\n[pip3] torch==2.7.0\n[pip3] torchaudio==2.7.0\n[pip3] torchvision==0.22.0\n[pip3] triton==3.3.0\n[conda] _anaconda_depends 2024.06 py312_mkl_2 \n[conda] blas 1.0 mkl \n[conda] flashinfer-python 0.2.5+cu126torch2.6 pypi_0 pypi\n[conda] mkl 2023.1.0 h213fc3f_46344 \n[conda] mkl-service 2.4.0 py312h5eee18b_1 \n[conda] mkl_fft 1.3.8 py312h5eee18b_0 \n[conda] mkl_random 1.2.4 py312hdb19cb5_0 \n[conda] numpy 1.26.4 py312hc5e2394_0 \n[conda] numpy-base 1.26.4 py312h0da6c21_0 \n[conda] numpydoc 1.7.0 py312h06a4308_0 \n[conda] nvidia-cublas-cu12 12.6.4.1 pypi_0 pypi\n[conda] nvidia-cuda-cupti-cu12 12.6.80 pypi_0 pypi\n[conda] nvidia-cuda-nvrtc-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cuda-runtime-cu12 12.6.77 pypi_0 pypi\n[conda] nvidia-cudnn-cu12 9.5.1.17 pypi_0 pypi\n[conda] nvidia-cufft-cu12 11.3.0.4 pypi_0 pypi\n[conda] nvidia-curand-cu12 10.3.7.77 pypi_0 pypi\n[conda] nvidia-cusolver-cu12 11.7.1.2 pypi_0 pypi\n[conda] nvidia-cusparse-cu12 12.5.4.2 pypi_0 pypi\n[conda] nvidia-cusparselt-cu12 0.6.3 pypi_0 pypi\n[conda] nvidia-nccl-cu12 2.26.2 pypi_0 pypi\n[conda] nvidia-nvjitlink-cu12 12.6.85 pypi_0 pypi\n[conda] nvidia-nvtx-cu12 12.6.77 pypi_0 pypi\n[conda] torch 2.7.0 pypi_0 pypi\n[conda] torchaudio 2.7.0 pypi_0 pypi\n[conda] torchvision 0.22.0 pypi_0 pypi\n[conda] triton 3.3.0 pypi_0 pypi",
  "transformers_version": "4.53.0.dev0",
  "lm_eval_version": "0.4.8",
  "upper_git_hash": null,
  "tokenizer_pad_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_eos_token": [
    "<|end_of_text|>",
    "1"
  ],
  "tokenizer_bos_token": [
    "<|begin_of_text|>",
    "0"
  ],
  "eot_token_id": 1,
  "max_length": 32768,
  "task_hashes": {
    "leaderboard_gpqa_diamond": "45f449f3b3dfc0be532cf1913f1559cf9d7645e56ec3c3fe01317fc575a54e3d"
  },
  "model_source": "vllm",
  "model_name": "/lustrefs/users/runner/checkpoints/huggingface/vocab_trimmed/iter_1249000",
  "model_name_sanitized": "__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000",
  "system_instruction": null,
  "system_instruction_sha": null,
  "fewshot_as_multiturn": false,
  "chat_template": null,
  "chat_template_sha": null,
  "start_time": 1474830.813494996,
  "end_time": 1475231.21819884,
  "total_evaluation_time_seconds": "400.4047038441058"
}
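The task config above embeds the full process_docs used for leaderboard_gpqa_diamond: the three distractors and the correct answer are pooled, shuffled, and the target letter is derived from the correct answer's new position. For readers skimming the diff, a standalone restatement of that step; the example document is made up, and the harness's preprocess() text cleanup is omitted for brevity.

```python
import random

def shuffle_gpqa_choices(doc: dict, rng: random.Random) -> dict:
    # Pool distractors with the correct answer, shuffle, and record the answer letter.
    choices = [
        doc["Incorrect Answer 1"],
        doc["Incorrect Answer 2"],
        doc["Incorrect Answer 3"],
        doc["Correct Answer"],
    ]
    rng.shuffle(choices)
    correct_index = choices.index(doc["Correct Answer"])
    return {
        "choice1": choices[0],
        "choice2": choices[1],
        "choice3": choices[2],
        "choice4": choices[3],
        "answer": f"({chr(65 + correct_index)})",  # "(A)" .. "(D)"
    }

example = {
    "Incorrect Answer 1": "a singlet",
    "Incorrect Answer 2": "a doublet",
    "Incorrect Answer 3": "a quartet",
    "Correct Answer": "a triplet",
}
print(shuffle_gpqa_choices(example, random.Random(0)))
```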
iter_1249000/eval_results/leaderboard_gpqa_diamond/__lustrefs__users__runner__checkpoints__huggingface__vocab_trimmed__iter_1249000/samples_leaderboard_gpqa_diamond_2025-07-17T20-26-10.269030.jsonl
ADDED
The diff for this file is too large to render. See raw diff.