{ "results": { "ifeval": { "alias": "ifeval", "prompt_level_strict_acc,none": 0.7985212569316081, "prompt_level_strict_acc_stderr,none": 0.017260802262371536, "inst_level_strict_acc,none": 0.8585131894484412, "inst_level_strict_acc_stderr,none": "N/A", "prompt_level_loose_acc,none": 0.8354898336414048, "prompt_level_loose_acc_stderr,none": 0.015954017926718075, "inst_level_loose_acc,none": 0.8860911270983214, "inst_level_loose_acc_stderr,none": "N/A" }, "mmlu": { "acc,none": 0.7615585926141321, "acc_stderr,none": 0.0030901010446613567, "alias": "mmlu" }, "mmlu_humanities": { "acc,none": 0.7151965993623804, "acc_stderr,none": 0.006309318231913105, "alias": " - humanities" }, "mmlu_formal_logic": { "alias": " - formal_logic", "acc,none": 0.5873015873015873, "acc_stderr,none": 0.04403438954768178 }, "mmlu_high_school_european_history": { "alias": " - high_school_european_history", "acc,none": 0.8545454545454545, "acc_stderr,none": 0.02753019635506653 }, "mmlu_high_school_us_history": { "alias": " - high_school_us_history", "acc,none": 0.9411764705882353, "acc_stderr,none": 0.016514409561025796 }, "mmlu_high_school_world_history": { "alias": " - high_school_world_history", "acc,none": 0.8987341772151899, "acc_stderr,none": 0.019637720526065494 }, "mmlu_international_law": { "alias": " - international_law", "acc,none": 0.8677685950413223, "acc_stderr,none": 0.030922788320445795 }, "mmlu_jurisprudence": { "alias": " - jurisprudence", "acc,none": 0.8703703703703703, "acc_stderr,none": 0.032472243899179506 }, "mmlu_logical_fallacies": { "alias": " - logical_fallacies", "acc,none": 0.8773006134969326, "acc_stderr,none": 0.025777328426978916 }, "mmlu_moral_disputes": { "alias": " - moral_disputes", "acc,none": 0.8063583815028902, "acc_stderr,none": 0.02127423031751547 }, "mmlu_moral_scenarios": { "alias": " - moral_scenarios", "acc,none": 0.5899441340782123, "acc_stderr,none": 0.016449708209026186 }, "mmlu_philosophy": { "alias": " - philosophy", "acc,none": 0.7942122186495176, "acc_stderr,none": 0.02296133990676421 }, "mmlu_prehistory": { "alias": " - prehistory", "acc,none": 0.8364197530864198, "acc_stderr,none": 0.020581466138257107 }, "mmlu_professional_law": { "alias": " - professional_law", "acc,none": 0.605606258148631, "acc_stderr,none": 0.01248214166563135 }, "mmlu_world_religions": { "alias": " - world_religions", "acc,none": 0.8713450292397661, "acc_stderr,none": 0.02567934272327688 }, "mmlu_other": { "acc,none": 0.8258770518184744, "acc_stderr,none": 0.006523878593984539, "alias": " - other" }, "mmlu_business_ethics": { "alias": " - business_ethics", "acc,none": 0.82, "acc_stderr,none": 0.03861229196653691 }, "mmlu_clinical_knowledge": { "alias": " - clinical_knowledge", "acc,none": 0.8301886792452831, "acc_stderr,none": 0.023108393799841306 }, "mmlu_college_medicine": { "alias": " - college_medicine", "acc,none": 0.7630057803468208, "acc_stderr,none": 0.03242414757483098 }, "mmlu_global_facts": { "alias": " - global_facts", "acc,none": 0.59, "acc_stderr,none": 0.04943110704237104 }, "mmlu_human_aging": { "alias": " - human_aging", "acc,none": 0.8340807174887892, "acc_stderr,none": 0.024967553196547185 }, "mmlu_management": { "alias": " - management", "acc,none": 0.8543689320388349, "acc_stderr,none": 0.034926064766237934 }, "mmlu_marketing": { "alias": " - marketing", "acc,none": 0.9316239316239316, "acc_stderr,none": 0.016534627684311385 }, "mmlu_medical_genetics": { "alias": " - medical_genetics", "acc,none": 0.89, "acc_stderr,none": 0.031446603773522014 }, "mmlu_miscellaneous": { 
"alias": " - miscellaneous", "acc,none": 0.9195402298850575, "acc_stderr,none": 0.009726831316141792 }, "mmlu_nutrition": { "alias": " - nutrition", "acc,none": 0.8594771241830066, "acc_stderr,none": 0.019899435463539943 }, "mmlu_professional_accounting": { "alias": " - professional_accounting", "acc,none": 0.6418439716312057, "acc_stderr,none": 0.028602085862759398 }, "mmlu_professional_medicine": { "alias": " - professional_medicine", "acc,none": 0.8602941176470589, "acc_stderr,none": 0.021059408919012472 }, "mmlu_virology": { "alias": " - virology", "acc,none": 0.5662650602409639, "acc_stderr,none": 0.03858158940685519 }, "mmlu_social_sciences": { "acc,none": 0.861878453038674, "acc_stderr,none": 0.006145727731269134, "alias": " - social sciences" }, "mmlu_econometrics": { "alias": " - econometrics", "acc,none": 0.6842105263157895, "acc_stderr,none": 0.04372748290278002 }, "mmlu_high_school_geography": { "alias": " - high_school_geography", "acc,none": 0.8939393939393939, "acc_stderr,none": 0.02193804773885311 }, "mmlu_high_school_government_and_politics": { "alias": " - high_school_government_and_politics", "acc,none": 0.9533678756476683, "acc_stderr,none": 0.015216761819262604 }, "mmlu_high_school_macroeconomics": { "alias": " - high_school_macroeconomics", "acc,none": 0.8076923076923077, "acc_stderr,none": 0.019982347208637313 }, "mmlu_high_school_microeconomics": { "alias": " - high_school_microeconomics", "acc,none": 0.9117647058823529, "acc_stderr,none": 0.018424189175962204 }, "mmlu_high_school_psychology": { "alias": " - high_school_psychology", "acc,none": 0.9119266055045872, "acc_stderr,none": 0.012150743719481636 }, "mmlu_human_sexuality": { "alias": " - human_sexuality", "acc,none": 0.8702290076335878, "acc_stderr,none": 0.02947364949690705 }, "mmlu_professional_psychology": { "alias": " - professional_psychology", "acc,none": 0.8382352941176471, "acc_stderr,none": 0.014897186424298686 }, "mmlu_public_relations": { "alias": " - public_relations", "acc,none": 0.8, "acc_stderr,none": 0.038313051408846055 }, "mmlu_security_studies": { "alias": " - security_studies", "acc,none": 0.8204081632653061, "acc_stderr,none": 0.024573293589585647 }, "mmlu_sociology": { "alias": " - sociology", "acc,none": 0.8756218905472637, "acc_stderr,none": 0.023335401790166316 }, "mmlu_us_foreign_policy": { "alias": " - us_foreign_policy", "acc,none": 0.92, "acc_stderr,none": 0.027265992434429072 }, "mmlu_stem": { "acc,none": 0.7155090390104663, "acc_stderr,none": 0.005432838781581167, "alias": "stem" }, "mmlu_abstract_algebra": { "alias": " - abstract_algebra", "acc,none": 0.52, "acc_stderr,none": 0.05021167315686783 }, "mmlu_anatomy": { "alias": " - anatomy", "acc,none": 0.8148148148148148, "acc_stderr,none": 0.03355677216313143 }, "mmlu_astronomy": { "alias": " - astronomy", "acc,none": 0.8947368421052632, "acc_stderr,none": 0.02497453345092068 }, "mmlu_college_biology": { "alias": " - college_biology", "acc,none": 0.875, "acc_stderr,none": 0.02765610492929436 }, "mmlu_college_chemistry": { "alias": " - college_chemistry", "acc,none": 0.54, "acc_stderr,none": 0.05009082659620331 }, "mmlu_college_computer_science": { "alias": " - college_computer_science", "acc,none": 0.67, "acc_stderr,none": 0.04725815626252609 }, "mmlu_college_mathematics": { "alias": " - college_mathematics", "acc,none": 0.43, "acc_stderr,none": 0.049756985195624305 }, "mmlu_college_physics": { "alias": " - college_physics", "acc,none": 0.6078431372549019, "acc_stderr,none": 0.04858083574266346 }, "mmlu_computer_security": { 
"alias": " - computer_security", "acc,none": 0.77, "acc_stderr,none": 0.042295258468165065 }, "mmlu_conceptual_physics": { "alias": " - conceptual_physics", "acc,none": 0.8042553191489362, "acc_stderr,none": 0.025937853139977127 }, "mmlu_electrical_engineering": { "alias": " - electrical_engineering", "acc,none": 0.7172413793103448, "acc_stderr,none": 0.03752833958003335 }, "mmlu_elementary_mathematics": { "alias": " - elementary_mathematics", "acc,none": 0.7275132275132276, "acc_stderr,none": 0.022930973071633363 }, "mmlu_high_school_biology": { "alias": " - high_school_biology", "acc,none": 0.8838709677419355, "acc_stderr,none": 0.01822575794943229 }, "mmlu_high_school_chemistry": { "alias": " - high_school_chemistry", "acc,none": 0.729064039408867, "acc_stderr,none": 0.03127090713297701 }, "mmlu_high_school_computer_science": { "alias": " - high_school_computer_science", "acc,none": 0.89, "acc_stderr,none": 0.031446603773522014 }, "mmlu_high_school_mathematics": { "alias": " - high_school_mathematics", "acc,none": 0.46296296296296297, "acc_stderr,none": 0.03040178640610153 }, "mmlu_high_school_physics": { "alias": " - high_school_physics", "acc,none": 0.6490066225165563, "acc_stderr,none": 0.038969819642573705 }, "mmlu_high_school_statistics": { "alias": " - high_school_statistics", "acc,none": 0.6990740740740741, "acc_stderr,none": 0.03128039084329886 }, "mmlu_machine_learning": { "alias": " - machine_learning", "acc,none": 0.6785714285714286, "acc_stderr,none": 0.044328040552915185 } }, "groups": { "mmlu": { "acc,none": 0.7615585926141321, "acc_stderr,none": 0.0030901010446613567, "alias": "mmlu" }, "mmlu_humanities": { "acc,none": 0.7151965993623804, "acc_stderr,none": 0.006309318231913105, "alias": " - humanities" }, "mmlu_other": { "acc,none": 0.8258770518184744, "acc_stderr,none": 0.006523878593984539, "alias": " - other" }, "mmlu_social_sciences": { "acc,none": 0.861878453038674, "acc_stderr,none": 0.006145727731269134, "alias": " - social sciences" }, "mmlu_stem": { "acc,none": 0.7155090390104663, "acc_stderr,none": 0.005432838781581167, "alias": "stem" } }, "group_subtasks": { "ifeval": [], "mmlu_humanities": [ "mmlu_philosophy", "mmlu_high_school_us_history", "mmlu_logical_fallacies", "mmlu_high_school_european_history", "mmlu_high_school_world_history", "mmlu_jurisprudence", "mmlu_prehistory", "mmlu_world_religions", "mmlu_moral_disputes", "mmlu_professional_law", "mmlu_international_law", "mmlu_formal_logic", "mmlu_moral_scenarios" ], "mmlu_social_sciences": [ "mmlu_high_school_psychology", "mmlu_econometrics", "mmlu_professional_psychology", "mmlu_high_school_microeconomics", "mmlu_security_studies", "mmlu_high_school_government_and_politics", "mmlu_human_sexuality", "mmlu_sociology", "mmlu_public_relations", "mmlu_us_foreign_policy", "mmlu_high_school_geography", "mmlu_high_school_macroeconomics" ], "mmlu_other": [ "mmlu_business_ethics", "mmlu_global_facts", "mmlu_professional_accounting", "mmlu_professional_medicine", "mmlu_miscellaneous", "mmlu_marketing", "mmlu_nutrition", "mmlu_virology", "mmlu_clinical_knowledge", "mmlu_management", "mmlu_college_medicine", "mmlu_medical_genetics", "mmlu_human_aging" ], "mmlu": [ "mmlu_stem", "mmlu_other", "mmlu_social_sciences", "mmlu_humanities" ], "mmlu_stem": [ "mmlu_college_mathematics", "mmlu_college_computer_science", "mmlu_electrical_engineering", "mmlu_astronomy", "mmlu_college_biology", "mmlu_elementary_mathematics", "mmlu_high_school_computer_science", "mmlu_college_chemistry", "mmlu_anatomy", "mmlu_computer_security", 
"mmlu_high_school_chemistry", "mmlu_abstract_algebra", "mmlu_high_school_mathematics", "mmlu_high_school_biology", "mmlu_machine_learning", "mmlu_college_physics", "mmlu_high_school_physics", "mmlu_high_school_statistics", "mmlu_conceptual_physics" ] }, "configs": { "ifeval": { "task": "ifeval", "dataset_path": "google/IFEval", "test_split": "train", "doc_to_text": "prompt", "doc_to_target": 0, "unsafe_code": false, "process_results": "def process_results(doc, results):\n inp = InputExample(\n key=doc[\"key\"],\n instruction_id_list=doc[\"instruction_id_list\"],\n prompt=doc[\"prompt\"],\n kwargs=doc[\"kwargs\"],\n )\n response = results[0]\n\n out_strict = test_instruction_following_strict(inp, response)\n out_loose = test_instruction_following_loose(inp, response)\n\n return {\n \"prompt_level_strict_acc\": out_strict.follow_all_instructions,\n \"inst_level_strict_acc\": out_strict.follow_instruction_list,\n \"prompt_level_loose_acc\": out_loose.follow_all_instructions,\n \"inst_level_loose_acc\": out_loose.follow_instruction_list,\n }\n", "description": "", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "num_fewshot": 0, "metric_list": [ { "metric": "prompt_level_strict_acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "inst_level_strict_acc", "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", "higher_is_better": true }, { "metric": "prompt_level_loose_acc", "aggregation": "mean", "higher_is_better": true }, { "metric": "inst_level_loose_acc", "aggregation": "def agg_inst_level_acc(items):\n flat_items = [item for sublist in items for item in sublist]\n inst_level_acc = sum(flat_items) / len(flat_items)\n return inst_level_acc\n", "higher_is_better": true } ], "output_type": "generate_until", "generation_kwargs": { "until": [], "do_sample": false, "temperature": 0.0, "max_gen_toks": 1280 }, "repeats": 1, "should_decontaminate": false, "metadata": { "version": 4.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_abstract_algebra": { "task": "mmlu_abstract_algebra", "task_alias": "abstract_algebra", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "abstract_algebra", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about abstract algebra.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_anatomy": { "task": "mmlu_anatomy", "task_alias": "anatomy", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "anatomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about anatomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_astronomy": { "task": "mmlu_astronomy", "task_alias": "astronomy", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "astronomy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about astronomy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_business_ethics": { "task": "mmlu_business_ethics", "task_alias": "business_ethics", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "business_ethics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about business ethics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_clinical_knowledge": { "task": "mmlu_clinical_knowledge", "task_alias": "clinical_knowledge", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "clinical_knowledge", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about clinical knowledge.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_college_biology": { "task": "mmlu_college_biology", "task_alias": "college_biology", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_college_chemistry": { "task": "mmlu_college_chemistry", "task_alias": "college_chemistry", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_college_computer_science": { "task": "mmlu_college_computer_science", "task_alias": "college_computer_science", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_college_mathematics": { "task": "mmlu_college_mathematics", "task_alias": "college_mathematics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_college_medicine": { "task": "mmlu_college_medicine", "task_alias": "college_medicine", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_college_physics": { "task": "mmlu_college_physics", "task_alias": "college_physics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "college_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about college physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_computer_security": { "task": "mmlu_computer_security", "task_alias": "computer_security", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "computer_security", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about computer security.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_conceptual_physics": { "task": "mmlu_conceptual_physics", "task_alias": "conceptual_physics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "conceptual_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about conceptual physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_econometrics": { "task": "mmlu_econometrics", "task_alias": "econometrics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "econometrics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about econometrics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_electrical_engineering": { "task": "mmlu_electrical_engineering", "task_alias": "electrical_engineering", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "electrical_engineering", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about electrical engineering.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_elementary_mathematics": { "task": "mmlu_elementary_mathematics", "task_alias": "elementary_mathematics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "elementary_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about elementary mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_formal_logic": { "task": "mmlu_formal_logic", "task_alias": "formal_logic", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "formal_logic", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about formal logic.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_global_facts": { "task": "mmlu_global_facts", "task_alias": "global_facts", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "global_facts", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about global facts.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_biology": { "task": "mmlu_high_school_biology", "task_alias": "high_school_biology", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_biology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. 
{{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school biology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_chemistry": { "task": "mmlu_high_school_chemistry", "task_alias": "high_school_chemistry", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_chemistry", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school chemistry.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_computer_science": { "task": "mmlu_high_school_computer_science", "task_alias": "high_school_computer_science", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_computer_science", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school computer science.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_european_history": { "task": "mmlu_high_school_european_history", "task_alias": "high_school_european_history", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_european_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school european history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_geography": { "task": "mmlu_high_school_geography", "task_alias": "high_school_geography", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_geography", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school geography.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_government_and_politics": { "task": "mmlu_high_school_government_and_politics", "task_alias": "high_school_government_and_politics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_government_and_politics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school government and politics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_macroeconomics": { "task": "mmlu_high_school_macroeconomics", "task_alias": "high_school_macroeconomics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_macroeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school macroeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_mathematics": { "task": "mmlu_high_school_mathematics", "task_alias": "high_school_mathematics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_mathematics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school mathematics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_microeconomics": { "task": "mmlu_high_school_microeconomics", "task_alias": "high_school_microeconomics", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_microeconomics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school microeconomics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_physics": { "task": "mmlu_high_school_physics", "task_alias": "high_school_physics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_physics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school physics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_psychology": { "task": "mmlu_high_school_psychology", "task_alias": "high_school_psychology", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_statistics": { "task": "mmlu_high_school_statistics", "task_alias": "high_school_statistics", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_statistics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school statistics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_us_history": { "task": "mmlu_high_school_us_history", "task_alias": "high_school_us_history", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_us_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school us history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_high_school_world_history": { "task": "mmlu_high_school_world_history", "task_alias": "high_school_world_history", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "high_school_world_history", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about high school world history.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_human_aging": { "task": "mmlu_human_aging", "task_alias": "human_aging", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "human_aging", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human aging.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_human_sexuality": { "task": "mmlu_human_sexuality", "task_alias": "human_sexuality", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "human_sexuality", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about human sexuality.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_international_law": { "task": "mmlu_international_law", "task_alias": "international_law", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "international_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about international law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_jurisprudence": { "task": "mmlu_jurisprudence", "task_alias": "jurisprudence", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "jurisprudence", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about jurisprudence.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_logical_fallacies": { "task": "mmlu_logical_fallacies", "task_alias": "logical_fallacies", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "logical_fallacies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about logical fallacies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_machine_learning": { "task": "mmlu_machine_learning", "task_alias": "machine_learning", "tag": "mmlu_stem_tasks", "dataset_path": "cais/mmlu", "dataset_name": "machine_learning", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about machine learning.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_management": { "task": "mmlu_management", "task_alias": "management", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "management", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about management.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_marketing": { "task": "mmlu_marketing", "task_alias": "marketing", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "marketing", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. 
{{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about marketing.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_medical_genetics": { "task": "mmlu_medical_genetics", "task_alias": "medical_genetics", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "medical_genetics", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about medical genetics.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_miscellaneous": { "task": "mmlu_miscellaneous", "task_alias": "miscellaneous", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "miscellaneous", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about miscellaneous.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_moral_disputes": { "task": "mmlu_moral_disputes", "task_alias": "moral_disputes", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "moral_disputes", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral disputes.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_moral_scenarios": { "task": "mmlu_moral_scenarios", "task_alias": "moral_scenarios", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "moral_scenarios", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about moral scenarios.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_nutrition": { "task": "mmlu_nutrition", "task_alias": "nutrition", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "nutrition", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about nutrition.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_philosophy": { "task": "mmlu_philosophy", "task_alias": "philosophy", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "philosophy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about philosophy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_prehistory": { "task": "mmlu_prehistory", "task_alias": "prehistory", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "prehistory", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about prehistory.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_professional_accounting": { "task": "mmlu_professional_accounting", "task_alias": "professional_accounting", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_accounting", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional accounting.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_professional_law": { "task": "mmlu_professional_law", "task_alias": "professional_law", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_law", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional law.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_professional_medicine": { "task": "mmlu_professional_medicine", "task_alias": "professional_medicine", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_medicine", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional medicine.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_professional_psychology": { "task": "mmlu_professional_psychology", "task_alias": "professional_psychology", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "professional_psychology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about professional psychology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_public_relations": { "task": "mmlu_public_relations", "task_alias": "public_relations", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "public_relations", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about public relations.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_security_studies": { "task": "mmlu_security_studies", "task_alias": "security_studies", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "security_studies", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. 
{{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about security studies.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_sociology": { "task": "mmlu_sociology", "task_alias": "sociology", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "sociology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about sociology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_us_foreign_policy": { "task": "mmlu_us_foreign_policy", "task_alias": "us_foreign_policy", "tag": "mmlu_social_sciences_tasks", "dataset_path": "cais/mmlu", "dataset_name": "us_foreign_policy", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about us foreign policy.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_virology": { "task": "mmlu_virology", "task_alias": "virology", "tag": "mmlu_other_tasks", "dataset_path": "cais/mmlu", "dataset_name": "virology", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. 
{{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about virology.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } }, "mmlu_world_religions": { "task": "mmlu_world_religions", "task_alias": "world_religions", "tag": "mmlu_humanities_tasks", "dataset_path": "cais/mmlu", "dataset_name": "world_religions", "test_split": "test", "fewshot_split": "dev", "doc_to_text": "{{question.strip()}}\nA. {{choices[0]}}\nB. {{choices[1]}}\nC. {{choices[2]}}\nD. {{choices[3]}}\nAnswer:", "doc_to_target": "answer", "unsafe_code": false, "doc_to_choice": [ "A", "B", "C", "D" ], "description": "The following are multiple choice questions (with answers) about world religions.\n\n", "target_delimiter": " ", "fewshot_delimiter": "\n\n", "fewshot_config": { "sampler": "first_n" }, "num_fewshot": 0, "metric_list": [ { "metric": "acc", "aggregation": "mean", "higher_is_better": true } ], "output_type": "multiple_choice", "repeats": 1, "should_decontaminate": false, "metadata": { "version": 1.0, "pretrained": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "tokenizer": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "max_gen_toks": 4096, "max_model_len": 8192, "enable_prefix_caching": true, "enable_chunked_prefill": true, "tensor_parallel_size": 2 } } }, "versions": { "ifeval": 4.0, "mmlu": 2, "mmlu_abstract_algebra": 1.0, "mmlu_anatomy": 1.0, "mmlu_astronomy": 1.0, "mmlu_business_ethics": 1.0, "mmlu_clinical_knowledge": 1.0, "mmlu_college_biology": 1.0, "mmlu_college_chemistry": 1.0, "mmlu_college_computer_science": 1.0, "mmlu_college_mathematics": 1.0, "mmlu_college_medicine": 1.0, "mmlu_college_physics": 1.0, "mmlu_computer_security": 1.0, "mmlu_conceptual_physics": 1.0, "mmlu_econometrics": 1.0, "mmlu_electrical_engineering": 1.0, "mmlu_elementary_mathematics": 1.0, "mmlu_formal_logic": 1.0, "mmlu_global_facts": 1.0, "mmlu_high_school_biology": 1.0, "mmlu_high_school_chemistry": 1.0, "mmlu_high_school_computer_science": 1.0, "mmlu_high_school_european_history": 1.0, "mmlu_high_school_geography": 1.0, "mmlu_high_school_government_and_politics": 1.0, "mmlu_high_school_macroeconomics": 1.0, "mmlu_high_school_mathematics": 1.0, "mmlu_high_school_microeconomics": 1.0, "mmlu_high_school_physics": 1.0, "mmlu_high_school_psychology": 1.0, "mmlu_high_school_statistics": 1.0, "mmlu_high_school_us_history": 1.0, "mmlu_high_school_world_history": 1.0, "mmlu_human_aging": 1.0, "mmlu_human_sexuality": 1.0, "mmlu_humanities": 2, "mmlu_international_law": 1.0, "mmlu_jurisprudence": 1.0, "mmlu_logical_fallacies": 1.0, "mmlu_machine_learning": 1.0, "mmlu_management": 1.0, "mmlu_marketing": 1.0, "mmlu_medical_genetics": 1.0, "mmlu_miscellaneous": 1.0, "mmlu_moral_disputes": 1.0, 
"mmlu_moral_scenarios": 1.0, "mmlu_nutrition": 1.0, "mmlu_other": 2, "mmlu_philosophy": 1.0, "mmlu_prehistory": 1.0, "mmlu_professional_accounting": 1.0, "mmlu_professional_law": 1.0, "mmlu_professional_medicine": 1.0, "mmlu_professional_psychology": 1.0, "mmlu_public_relations": 1.0, "mmlu_security_studies": 1.0, "mmlu_social_sciences": 2, "mmlu_sociology": 1.0, "mmlu_stem": 2, "mmlu_us_foreign_policy": 1.0, "mmlu_virology": 1.0, "mmlu_world_religions": 1.0 }, "n-shot": { "ifeval": 0, "mmlu_abstract_algebra": 0, "mmlu_anatomy": 0, "mmlu_astronomy": 0, "mmlu_business_ethics": 0, "mmlu_clinical_knowledge": 0, "mmlu_college_biology": 0, "mmlu_college_chemistry": 0, "mmlu_college_computer_science": 0, "mmlu_college_mathematics": 0, "mmlu_college_medicine": 0, "mmlu_college_physics": 0, "mmlu_computer_security": 0, "mmlu_conceptual_physics": 0, "mmlu_econometrics": 0, "mmlu_electrical_engineering": 0, "mmlu_elementary_mathematics": 0, "mmlu_formal_logic": 0, "mmlu_global_facts": 0, "mmlu_high_school_biology": 0, "mmlu_high_school_chemistry": 0, "mmlu_high_school_computer_science": 0, "mmlu_high_school_european_history": 0, "mmlu_high_school_geography": 0, "mmlu_high_school_government_and_politics": 0, "mmlu_high_school_macroeconomics": 0, "mmlu_high_school_mathematics": 0, "mmlu_high_school_microeconomics": 0, "mmlu_high_school_physics": 0, "mmlu_high_school_psychology": 0, "mmlu_high_school_statistics": 0, "mmlu_high_school_us_history": 0, "mmlu_high_school_world_history": 0, "mmlu_human_aging": 0, "mmlu_human_sexuality": 0, "mmlu_international_law": 0, "mmlu_jurisprudence": 0, "mmlu_logical_fallacies": 0, "mmlu_machine_learning": 0, "mmlu_management": 0, "mmlu_marketing": 0, "mmlu_medical_genetics": 0, "mmlu_miscellaneous": 0, "mmlu_moral_disputes": 0, "mmlu_moral_scenarios": 0, "mmlu_nutrition": 0, "mmlu_philosophy": 0, "mmlu_prehistory": 0, "mmlu_professional_accounting": 0, "mmlu_professional_law": 0, "mmlu_professional_medicine": 0, "mmlu_professional_psychology": 0, "mmlu_public_relations": 0, "mmlu_security_studies": 0, "mmlu_sociology": 0, "mmlu_us_foreign_policy": 0, "mmlu_virology": 0, "mmlu_world_religions": 0 }, "higher_is_better": { "ifeval": { "prompt_level_strict_acc": true, "inst_level_strict_acc": true, "prompt_level_loose_acc": true, "inst_level_loose_acc": true }, "mmlu": { "acc": true }, "mmlu_abstract_algebra": { "acc": true }, "mmlu_anatomy": { "acc": true }, "mmlu_astronomy": { "acc": true }, "mmlu_business_ethics": { "acc": true }, "mmlu_clinical_knowledge": { "acc": true }, "mmlu_college_biology": { "acc": true }, "mmlu_college_chemistry": { "acc": true }, "mmlu_college_computer_science": { "acc": true }, "mmlu_college_mathematics": { "acc": true }, "mmlu_college_medicine": { "acc": true }, "mmlu_college_physics": { "acc": true }, "mmlu_computer_security": { "acc": true }, "mmlu_conceptual_physics": { "acc": true }, "mmlu_econometrics": { "acc": true }, "mmlu_electrical_engineering": { "acc": true }, "mmlu_elementary_mathematics": { "acc": true }, "mmlu_formal_logic": { "acc": true }, "mmlu_global_facts": { "acc": true }, "mmlu_high_school_biology": { "acc": true }, "mmlu_high_school_chemistry": { "acc": true }, "mmlu_high_school_computer_science": { "acc": true }, "mmlu_high_school_european_history": { "acc": true }, "mmlu_high_school_geography": { "acc": true }, "mmlu_high_school_government_and_politics": { "acc": true }, "mmlu_high_school_macroeconomics": { "acc": true }, "mmlu_high_school_mathematics": { "acc": true }, "mmlu_high_school_microeconomics": { "acc": 
true }, "mmlu_high_school_physics": { "acc": true }, "mmlu_high_school_psychology": { "acc": true }, "mmlu_high_school_statistics": { "acc": true }, "mmlu_high_school_us_history": { "acc": true }, "mmlu_high_school_world_history": { "acc": true }, "mmlu_human_aging": { "acc": true }, "mmlu_human_sexuality": { "acc": true }, "mmlu_humanities": { "acc": true }, "mmlu_international_law": { "acc": true }, "mmlu_jurisprudence": { "acc": true }, "mmlu_logical_fallacies": { "acc": true }, "mmlu_machine_learning": { "acc": true }, "mmlu_management": { "acc": true }, "mmlu_marketing": { "acc": true }, "mmlu_medical_genetics": { "acc": true }, "mmlu_miscellaneous": { "acc": true }, "mmlu_moral_disputes": { "acc": true }, "mmlu_moral_scenarios": { "acc": true }, "mmlu_nutrition": { "acc": true }, "mmlu_other": { "acc": true }, "mmlu_philosophy": { "acc": true }, "mmlu_prehistory": { "acc": true }, "mmlu_professional_accounting": { "acc": true }, "mmlu_professional_law": { "acc": true }, "mmlu_professional_medicine": { "acc": true }, "mmlu_professional_psychology": { "acc": true }, "mmlu_public_relations": { "acc": true }, "mmlu_security_studies": { "acc": true }, "mmlu_social_sciences": { "acc": true }, "mmlu_sociology": { "acc": true }, "mmlu_stem": { "acc": true }, "mmlu_us_foreign_policy": { "acc": true }, "mmlu_virology": { "acc": true }, "mmlu_world_religions": { "acc": true } }, "n-samples": { "mmlu_college_mathematics": { "original": 100, "effective": 100 }, "mmlu_college_computer_science": { "original": 100, "effective": 100 }, "mmlu_electrical_engineering": { "original": 145, "effective": 145 }, "mmlu_astronomy": { "original": 152, "effective": 152 }, "mmlu_college_biology": { "original": 144, "effective": 144 }, "mmlu_elementary_mathematics": { "original": 378, "effective": 378 }, "mmlu_high_school_computer_science": { "original": 100, "effective": 100 }, "mmlu_college_chemistry": { "original": 100, "effective": 100 }, "mmlu_anatomy": { "original": 135, "effective": 135 }, "mmlu_computer_security": { "original": 100, "effective": 100 }, "mmlu_high_school_chemistry": { "original": 203, "effective": 203 }, "mmlu_abstract_algebra": { "original": 100, "effective": 100 }, "mmlu_high_school_mathematics": { "original": 270, "effective": 270 }, "mmlu_high_school_biology": { "original": 310, "effective": 310 }, "mmlu_machine_learning": { "original": 112, "effective": 112 }, "mmlu_college_physics": { "original": 102, "effective": 102 }, "mmlu_high_school_physics": { "original": 151, "effective": 151 }, "mmlu_high_school_statistics": { "original": 216, "effective": 216 }, "mmlu_conceptual_physics": { "original": 235, "effective": 235 }, "mmlu_business_ethics": { "original": 100, "effective": 100 }, "mmlu_global_facts": { "original": 100, "effective": 100 }, "mmlu_professional_accounting": { "original": 282, "effective": 282 }, "mmlu_professional_medicine": { "original": 272, "effective": 272 }, "mmlu_miscellaneous": { "original": 783, "effective": 783 }, "mmlu_marketing": { "original": 234, "effective": 234 }, "mmlu_nutrition": { "original": 306, "effective": 306 }, "mmlu_virology": { "original": 166, "effective": 166 }, "mmlu_clinical_knowledge": { "original": 265, "effective": 265 }, "mmlu_management": { "original": 103, "effective": 103 }, "mmlu_college_medicine": { "original": 173, "effective": 173 }, "mmlu_medical_genetics": { "original": 100, "effective": 100 }, "mmlu_human_aging": { "original": 223, "effective": 223 }, "mmlu_high_school_psychology": { "original": 545, "effective": 545 }, 
"mmlu_econometrics": { "original": 114, "effective": 114 }, "mmlu_professional_psychology": { "original": 612, "effective": 612 }, "mmlu_high_school_microeconomics": { "original": 238, "effective": 238 }, "mmlu_security_studies": { "original": 245, "effective": 245 }, "mmlu_high_school_government_and_politics": { "original": 193, "effective": 193 }, "mmlu_human_sexuality": { "original": 131, "effective": 131 }, "mmlu_sociology": { "original": 201, "effective": 201 }, "mmlu_public_relations": { "original": 110, "effective": 110 }, "mmlu_us_foreign_policy": { "original": 100, "effective": 100 }, "mmlu_high_school_geography": { "original": 198, "effective": 198 }, "mmlu_high_school_macroeconomics": { "original": 390, "effective": 390 }, "mmlu_philosophy": { "original": 311, "effective": 311 }, "mmlu_high_school_us_history": { "original": 204, "effective": 204 }, "mmlu_logical_fallacies": { "original": 163, "effective": 163 }, "mmlu_high_school_european_history": { "original": 165, "effective": 165 }, "mmlu_high_school_world_history": { "original": 237, "effective": 237 }, "mmlu_jurisprudence": { "original": 108, "effective": 108 }, "mmlu_prehistory": { "original": 324, "effective": 324 }, "mmlu_world_religions": { "original": 171, "effective": 171 }, "mmlu_moral_disputes": { "original": 346, "effective": 346 }, "mmlu_professional_law": { "original": 1534, "effective": 1534 }, "mmlu_international_law": { "original": 121, "effective": 121 }, "mmlu_formal_logic": { "original": 126, "effective": 126 }, "mmlu_moral_scenarios": { "original": 895, "effective": 895 }, "ifeval": { "original": 541, "effective": 541 } }, "config": { "model": "vllm", "model_args": "pretrained=/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge,tokenizer=/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge,max_gen_toks=4096,max_model_len=8192,enable_prefix_caching=True,enable_chunked_prefill=True,tensor_parallel_size=2", "batch_size": "auto", "batch_sizes": [], "device": null, "use_cache": null, "limit": null, "bootstrap_iters": 100000, "gen_kwargs": null, "random_seed": 0, "numpy_seed": 1234, "torch_seed": 1234, "fewshot_seed": 1234 }, "git_hash": "c0fc7172", "date": 1761268801.2141674, "pretty_env_info": "PyTorch version: 2.8.0+cu128\nIs debug build: False\nCUDA used to build PyTorch: 12.8\nROCM used to build PyTorch: N/A\n\nOS: Ubuntu 22.04.5 LTS (x86_64)\nGCC version: (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\nClang version: Could not collect\nCMake version: version 3.22.1\nLibc version: glibc-2.35\n\nPython version: 3.12.10 (main, Apr 9 2025, 04:03:51) [Clang 20.1.0 ] (64-bit runtime)\nPython platform: Linux-5.15.0-1082-nvidia-x86_64-with-glibc2.35\nIs CUDA available: True\nCUDA runtime version: 12.8.93\nCUDA_MODULE_LOADING set to: LAZY\nGPU models and configuration: \nGPU 0: NVIDIA A100-SXM4-80GB\nGPU 1: NVIDIA A100-SXM4-80GB\nGPU 2: NVIDIA A100-SXM4-80GB\nGPU 3: NVIDIA A100-SXM4-80GB\nGPU 4: NVIDIA A100-SXM4-80GB\nGPU 5: NVIDIA A100-SXM4-80GB\nGPU 6: NVIDIA A100-SXM4-80GB\nGPU 7: NVIDIA A100-SXM4-80GB\n\nNvidia driver version: 535.247.01\ncuDNN version: Could not collect\nHIP runtime version: N/A\nMIOpen runtime version: N/A\nIs XNNPACK available: True\n\nCPU:\nArchitecture: x86_64\nCPU op-mode(s): 32-bit, 64-bit\nAddress sizes: 43 bits physical, 48 bits virtual\nByte Order: Little Endian\nCPU(s): 256\nOn-line CPU(s) list: 0-255\nVendor ID: AuthenticAMD\nModel name: AMD EPYC 7742 64-Core Processor\nCPU family: 23\nModel: 49\nThread(s) per core: 2\nCore(s) 
per socket: 64\nSocket(s): 2\nStepping: 0\nFrequency boost: enabled\nCPU max MHz: 2250.0000\nCPU min MHz: 1500.0000\nBogoMIPS: 4491.47\nFlags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl nonstop_tsc cpuid extd_apicid aperfmperf rapl pni pclmulqdq monitor ssse3 fma cx16 sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand lahf_lm cmp_legacy svm extapic cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw ibs skinit wdt tce topoext perfctr_core perfctr_nb bpext perfctr_llc mwaitx cpb cat_l3 cdp_l3 hw_pstate ssbd mba ibrs ibpb stibp vmmcall fsgsbase bmi1 avx2 smep bmi2 cqm rdt_a rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 xsaves cqm_llc cqm_occup_llc cqm_mbm_total cqm_mbm_local clzero irperf xsaveerptr rdpru wbnoinvd amd_ppin arat npt lbrv svm_lock nrip_save tsc_scale vmcb_clean flushbyasid decodeassists pausefilter pfthreshold avic v_vmsave_vmload vgif v_spec_ctrl umip rdpid overflow_recov succor smca sme sev sev_es\nVirtualization: AMD-V\nL1d cache: 4 MiB (128 instances)\nL1i cache: 4 MiB (128 instances)\nL2 cache: 64 MiB (128 instances)\nL3 cache: 512 MiB (32 instances)\nNUMA node(s): 8\nNUMA node0 CPU(s): 0-15,128-143\nNUMA node1 CPU(s): 16-31,144-159\nNUMA node2 CPU(s): 32-47,160-175\nNUMA node3 CPU(s): 48-63,176-191\nNUMA node4 CPU(s): 64-79,192-207\nNUMA node5 CPU(s): 80-95,208-223\nNUMA node6 CPU(s): 96-111,224-239\nNUMA node7 CPU(s): 112-127,240-255\nVulnerability Gather data sampling: Not affected\nVulnerability Indirect target selection: Not affected\nVulnerability Itlb multihit: Not affected\nVulnerability L1tf: Not affected\nVulnerability Mds: Not affected\nVulnerability Meltdown: Not affected\nVulnerability Mmio stale data: Not affected\nVulnerability Reg file data sampling: Not affected\nVulnerability Retbleed: Mitigation; untrained return thunk; SMT enabled with STIBP protection\nVulnerability Spec rstack overflow: Mitigation; safe RET\nVulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp\nVulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization\nVulnerability Spectre v2: Mitigation; Retpolines; IBPB conditional; STIBP always-on; RSB filling; PBRSB-eIBRS Not affected; BHI Not affected\nVulnerability Srbds: Not affected\nVulnerability Tsx async abort: Not affected\n\nVersions of relevant libraries:\n[pip3] numpy==2.2.6\n[pip3] nvidia-cublas-cu12==12.8.4.1\n[pip3] nvidia-cuda-cupti-cu12==12.8.90\n[pip3] nvidia-cuda-nvrtc-cu12==12.8.93\n[pip3] nvidia-cuda-runtime-cu12==12.8.90\n[pip3] nvidia-cudnn-cu12==9.10.2.21\n[pip3] nvidia-cufft-cu12==11.3.3.83\n[pip3] nvidia-curand-cu12==10.3.9.90\n[pip3] nvidia-cusolver-cu12==11.7.3.90\n[pip3] nvidia-cusparse-cu12==12.5.8.93\n[pip3] nvidia-cusparselt-cu12==0.7.1\n[pip3] nvidia-nccl-cu12==2.27.3\n[pip3] nvidia-nvjitlink-cu12==12.8.93\n[pip3] nvidia-nvtx-cu12==12.8.90\n[pip3] torch==2.8.0\n[pip3] torchaudio==2.8.0\n[pip3] torchvision==0.23.0\n[pip3] triton==3.4.0\n[conda] Could not collect", "transformers_version": "4.57.0", "lm_eval_version": "0.4.9.1", "upper_git_hash": null, "tokenizer_pad_token": [ "", "11" ], "tokenizer_eos_token": [ "", "2" ], "tokenizer_bos_token": [ "", "1" ], "eot_token_id": 2, "max_length": 8192, "task_hashes": {}, "model_source": "vllm", "model_name": "/home/dgxuser/workspace/Mango/models/mergekit/outputs/GRPo-ckpt-500-compliance-judge", "model_name_sanitized": 
"__home__dgxuser__workspace__Mango__models__mergekit__outputs__GRPo-ckpt-500-compliance-judge", "system_instruction": null, "system_instruction_sha": null, "fewshot_as_multiturn": false, "chat_template": "{#- Unsloth template fixes #}\n{%- set yesterday_day = strftime_now(\"%d\") %}\n{%- set yesterday_month = strftime_now(\"%m\") %}\n{%- set yesterday_year = strftime_now(\"%Y\") %}\n{%- set today_date = yesterday_year + '-' + yesterday_month + '-' + yesterday_day %}\n{%- if yesterday_day == '01' %}\n {#- Jinja doesnt allow minus 1 date - Unsloth alternative #}\n {%- if yesterday_month == '01' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '12' %}\n {%- if yesterday_year == '2024' %}\n {%- set yesterday_year = '2023' %}\n {%- elif yesterday_year == '2025' %}\n {%- set yesterday_year = '2024' %}\n {%- elif yesterday_year == '2026' %}\n {%- set yesterday_year = '2025' %}\n {%- elif yesterday_year == '2027' %}\n {%- set yesterday_year = '2026' %}\n {%- elif yesterday_year == '2028' %}\n {%- set yesterday_year = '2027' %}\n {%- elif yesterday_year == '2029' %}\n {%- set yesterday_year = '2028' %}\n {%- elif yesterday_year == '2030' %}\n {%- set yesterday_year = '2029' %}\n {%- elif yesterday_year == '2031' %}\n {%- set yesterday_year = '2030' %}\n {%- elif yesterday_year == '2032' %}\n {%- set yesterday_year = '2031' %}\n {%- elif yesterday_year == '1970' %}\n {#- Stop llama_cpp from erroring out #}\n {%- set yesterday_year = '1970' %}\n {%- else %}\n {{- raise_exception('Unsloth custom template does not support years > 2032. Error year = [' + yesterday_year + ']') }}\n {%- endif %}\n {%- elif yesterday_month == '02' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '01' %}\n {%- elif yesterday_month == '03' %}\n {%- set yesterday_month = '02' %}\n {%- set yesterday_day = '28' %}\n {%- if yesterday_year == '2024' %}\n {%- set yesterday_day = '29' %}\n {%- elif yesterday_year == '2028' %}\n {%- set yesterday_day = '29' %}\n {%- elif yesterday_year == '2032' %}\n {%- set yesterday_day = '29' %}\n {%- elif yesterday_year == '1970' %}\n {#- Stop llama_cpp from erroring out #}\n {%- set yesterday_day = '29' %}\n {%- else %}\n {{- raise_exception('Unsloth custom template does not support years > 2032. 
Error year = [' + yesterday_year + ']') }}\n {%- endif %}\n {%- elif yesterday_month == '04' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '03' %}\n {%- elif yesterday_month == '05' %}\n {%- set yesterday_day = '30' %}\n {%- set yesterday_month = '04' %}\n {%- elif yesterday_month == '06' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '05' %}\n {%- elif yesterday_month == '07' %}\n {%- set yesterday_day = '30' %}\n {%- set yesterday_month = '06' %}\n {%- elif yesterday_month == '08' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '07' %}\n {%- elif yesterday_month == '09' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '08' %}\n {%- elif yesterday_month == '10' %}\n {%- set yesterday_day = '30' %}\n {%- set yesterday_month = '09' %}\n {%- elif yesterday_month == '11' %}\n {%- set yesterday_day = '31' %}\n {%- set yesterday_month = '10' %}\n {%- elif yesterday_month == '12' %}\n {%- set yesterday_day = '30' %}\n {%- set yesterday_month = '11' %}\n {%- endif %}\n{%- elif yesterday_day == '02' %}\n {%- set yesterday_day = '01' %}\n{%- elif yesterday_day == '03' %}\n {%- set yesterday_day = '02' %}\n{%- elif yesterday_day == '04' %}\n {%- set yesterday_day = '03' %}\n{%- elif yesterday_day == '05' %}\n {%- set yesterday_day = '04' %}\n{%- elif yesterday_day == '06' %}\n {%- set yesterday_day = '05' %}\n{%- elif yesterday_day == '07' %}\n {%- set yesterday_day = '06' %}\n{%- elif yesterday_day == '08' %}\n {%- set yesterday_day = '07' %}\n{%- elif yesterday_day == '09' %}\n {%- set yesterday_day = '08' %}\n{%- elif yesterday_day == '10' %}\n {%- set yesterday_day = '09' %}\n{%- elif yesterday_day == '11' %}\n {%- set yesterday_day = '10' %}\n{%- elif yesterday_day == '12' %}\n {%- set yesterday_day = '11' %}\n{%- elif yesterday_day == '13' %}\n {%- set yesterday_day = '12' %}\n{%- elif yesterday_day == '14' %}\n {%- set yesterday_day = '13' %}\n{%- elif yesterday_day == '15' %}\n {%- set yesterday_day = '14' %}\n{%- elif yesterday_day == '16' %}\n {%- set yesterday_day = '15' %}\n{%- elif yesterday_day == '17' %}\n {%- set yesterday_day = '16' %}\n{%- elif yesterday_day == '18' %}\n {%- set yesterday_day = '17' %}\n{%- elif yesterday_day == '19' %}\n {%- set yesterday_day = '18' %}\n{%- elif yesterday_day == '20' %}\n {%- set yesterday_day = '19' %}\n{%- elif yesterday_day == '21' %}\n {%- set yesterday_day = '20' %}\n{%- elif yesterday_day == '22' %}\n {%- set yesterday_day = '21' %}\n{%- elif yesterday_day == '23' %}\n {%- set yesterday_day = '22' %}\n{%- elif yesterday_day == '24' %}\n {%- set yesterday_day = '23' %}\n{%- elif yesterday_day == '25' %}\n {%- set yesterday_day = '24' %}\n{%- elif yesterday_day == '26' %}\n {%- set yesterday_day = '25' %}\n{%- elif yesterday_day == '27' %}\n {%- set yesterday_day = '26' %}\n{%- elif yesterday_day == '28' %}\n {%- set yesterday_day = '27' %}\n{%- elif yesterday_day == '29' %}\n {%- set yesterday_day = '28' %}\n{%- elif yesterday_day == '30' %}\n {%- set yesterday_day = '29' %}\n{%- elif yesterday_day == '31' %}\n {%- set yesterday_day = '30' %}\n{%- endif %}\n{#- Edits made by Unsloth #}\n{%- set yesterday_date = yesterday_year + '-' + yesterday_month + '-' + yesterday_day %}\n{%- set default_system_message = \"You are Mistral-Small-3.2-24B-Instruct-2506, a Large Language Model (LLM) created by Mistral AI, a French startup headquartered in Paris.\\nYou power an AI assistant called Le Chat.\\nYour knowledge base was last updated on 2023-10-01.\\nThe current date is \" + 
today_date + \".\\n\\nWhen you\\'re not sure about some information or when the user\\'s request requires up-to-date or specific data, you must use the available tools to fetch the information. Do not hesitate to use tools whenever they can provide a more accurate or complete response. If no relevant tools are available, then clearly state that you don\\'t have the information and avoid making up anything.\\nIf the user\\'s question is not clear, ambiguous, or does not provide enough context for you to accurately answer the question, you do not try to answer it right away and you rather ask the user to clarify their request (e.g. \\\"What are some good restaurants around me?\\\" => \\\"Where are you?\\\" or \\\"When is the next flight to Tokyo\\\" => \\\"Where do you travel from?\\\").\\nYou are always very attentive to dates, in particular you try to resolve dates (e.g. \\\"yesterday\\\" is \" + yesterday_date + \") and when asked about information at specific dates, you discard information that is at another date.\\nYou follow these instructions in all languages, and always respond to the user in the language they use or request.\\nNext sections describe the capabilities that you have.\\n\\n# WEB BROWSING INSTRUCTIONS\\n\\nYou cannot perform any web search or access internet to open URLs, links etc. If it seems like the user is expecting you to do so, you clarify the situation and ask the user to copy paste the text directly in the chat.\\n\\n# MULTI-MODAL INSTRUCTIONS\\n\\nYou have the ability to read images, but you cannot generate images. You also cannot transcribe audio files or videos.\\nYou cannot read nor transcribe audio files or videos.\\n\\n# TOOL CALLING INSTRUCTIONS\\n\\nYou may have access to tools that you can use to fetch information or perform actions. You must use these tools in the following situations:\\n\\n1. When the request requires up-to-date information.\\n2. When the request requires specific data that you do not have in your knowledge base.\\n3. When the request involves actions that you cannot perform without tools.\\n\\nAlways prioritize using tools to provide the most accurate and helpful response. If tools are not available, inform the user that you cannot perform the requested action at the moment.\" %}\n\n{{- bos_token }}\n\n{%- if messages[0]['role'] == 'system' %}\n {%- if messages[0]['content'] is string %}\n {%- set system_message = messages[0]['content'] %}\n {%- elif messages[0]['content'] is iterable %}\n {%- set system_message = messages[0]['content'][0]['text'] %}\n {%- else %}\n {%- set system_message = messages[0]['content']|string %}\n {%- endif %}\n {%- set loop_messages = messages[1:] %}\n{%- else %}\n {%- set system_message = default_system_message %}\n {%- set loop_messages = messages %}\n{%- endif %}\n{{- '[SYSTEM_PROMPT]' + system_message + '[/SYSTEM_PROMPT]' }}\n\n\n{#- Tool description appended ONLY to last user message. Edits made by Unsloth #}\n{#- Tool description appended also if last message is tool. 
Edits made by Unsloth #}\n{%- set tools_description = \"\" %}\n{%- set has_tools = false %}\n{#- Cannot use set append_tools_index = loop.index0 since temporary variable - must use namespace #}\n{%- set ns = namespace(append_tools_index=0, append_tools=false) %}\n\n{%- if tools is defined and tools is not none and tools|length > 0 %}\n\n {%- set has_tools = true %}\n {%- set tools_description = \"[AVAILABLE_TOOLS]\" + (tools | tojson) + \"[/AVAILABLE_TOOLS]\" %}\n\n {#- If User,Assistant,Tool,Tool we also need to append tools_description to last assistant WITHOUT tool_calls defined. Edits made by Unsloth #}\n {%- if (loop_messages|last)['role'] == 'tool' %}\n\n {#- Find last assistant WITHOUT tool_calls defined #}\n {%- set ns.append_tools = true %}\n {%- for message in loop_messages %}\n {%- if message['role'] == 'assistant' %}\n\n {#- Cannot use set append_tools_index = loop.index0 since temporary variable - must use namespace #}\n\n {%- if message['tool_calls'] is not defined or message['tool_calls'] is none %}\n {%- set ns.append_tools_index = loop.index0 %}\n {%- endif %}\n\n {%- endif %}\n {%- endfor %}\n\n {%- endif %}\n\n{%- endif %}\n\n{%- for message in loop_messages %}\n {%- if message['role'] == 'user' %}\n\n {%- if has_tools and loop.last %}\n {{- tools_description }}\n {%- endif %}\n\n {#- If directly called tools in first turn, prepend to user #}\n {%- if ns.append_tools and ns.append_tools_index == 0 %}\n {{- tools_description }}\n {%- endif %}\n\n {%- if message['content'] is string %}\n {{- '[INST]' + message['content'] + '[/INST]' }}\n {%- else %}\n {{- '[INST]' }}\n {%- for block in message['content'] %}\n {%- if block['type'] == 'text' %}\n\n {#- Original did not have content which is weird. Added by Un-sloth. #}\n {%- if block['text'] is defined %}\n {{- block['text'] }}\n {%- else %}\n {{- block['content'] }}\n {%- endif %}\n\n {%- elif block['type'] in ['image', 'image_url'] %}\n {{- '[IMG]' }}\n {%- else %}\n {{- raise_exception('Only text and image blocks are supported in message content!') }}\n {%- endif %}\n {%- endfor %}\n {{- '[/INST]' }}\n {%- endif %}\n\n {%- elif message['role'] == 'system' %}\n {%- if message['content'] is string %}\n {{- '[SYSTEM_PROMPT]' + message['content'] + '[/SYSTEM_PROMPT]' }}\n {%- elif message['content'] is iterable %}\n {{- '[SYSTEM_PROMPT]' + message['content'][0]['text'] + '[/SYSTEM_PROMPT]' }}\n {%- else %}\n {{- '[SYSTEM_PROMPT]' + message['content']|string + '[/SYSTEM_PROMPT]' }}\n {%- endif %}\n \n\n {%- elif message['role'] == 'assistant' %}\n {%- if message['content'] is string %}\n {{- message['content'] }}\n {%- elif message['content'] is iterable %}\n {{- message['content'][0]['text'] }}\n {%- else %}\n {{- message['content']|string }}\n {%- endif %}\n\n {#- If User,Assistant,Tool,Tool we also need to append tools_description. 
Edits made by Unsloth #}\n\n {%- if has_tools and (loop.index0 == ns.append_tools_index) %}\n {{- eos_token }}\n {{- tools_description }}\n {%- endif %}\n\n {%- if message['tool_calls'] is defined and message['tool_calls'] is not none %}\n {%- for tool in message['tool_calls'] %}\n {%- if tool['id'] is not defined %}\n {{- raise_exception('Tool ID must be provided!') }}\n {%- endif %}\n {%- set tool_call_id = tool['id'] %}\n {%- if tool_call_id is not string or tool_call_id|length < 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length >= 9!\") }}\n {%- endif %}\n {%- set arguments = tool['function']['arguments'] %}\n {%- if arguments is not string %}\n {%- set arguments = arguments|tojson %}\n {%- endif %}\n {#- Must list tool calls AFTER assistant. Edits made by Un-sloth #}\n {{- \"[TOOL_CALLS]\" + tool['function']['name'] + \"[CALL_ID]\" + tool_call_id + \"[ARGS]\" + arguments }}\n {%- endfor %}\n {%- endif %}\n\n {#- Must not add EOS if added tools_description. Unsloth edits. #}\n {%- if (loop.index0 != ns.append_tools_index) %}\n {{- eos_token }}\n {%- endif %}\n\n {%- elif message[\"role\"] == \"tool_results\" or message[\"role\"] == \"tool\" %}\n {%- if message.content is defined and message.content.content is defined %}\n {%- set content = message.content.content %}\n {%- else %}\n {%- set content = message.content %}\n {%- endif %}\n {%- if message['tool_call_id'] is not defined %}\n {{- raise_exception('tool_call_id must be provided!') }}\n {%- endif %}\n {%- set tool_call_id = message['tool_call_id'] %}\n {%- if tool_call_id is not string or tool_call_id|length < 9 %}\n {{- raise_exception(\"Tool call IDs should be alphanumeric strings with length >= 9!\") }}\n {%- endif %}\n {{- \"[TOOL_RESULTS]\" + tool_call_id + \"[TOOL_CONTENT]\" + content|string + \"[/TOOL_RESULTS]\" }}\n\n {%- else %}\n {{- raise_exception('Only user, systemm assistant and tool roles are supported in the custom template made by Unsloth!') }}\n {%- endif %}\n{%- endfor %}\n{#- Copyright 2025-present Unsloth. Apache 2.0 License. #}", "chat_template_sha": "a126eea7996a0ac9540b2fc7fb4af1dad975d3b388c194c713921f2b4205aa19", "start_time": 284269.304876447, "end_time": 285489.71091494, "total_evaluation_time_seconds": "1220.4060384929762" }
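The object above is the full results JSON written by lm-evaluation-harness 0.4.9.1 for this run. A minimal sketch of reading the headline metrics back out with the Python standard library follows; the filename results.json is an assumption, while the key names ("results", "config", "acc,none", "prompt_level_strict_acc,none") come directly from the JSON itself, and the lm_eval command shown in the trailing comment is illustrative rather than a verbatim record of how this run was launched.

import json

# Assumed filename: point this at wherever the JSON above was saved.
with open("results.json", "r", encoding="utf-8") as f:
    report = json.load(f)

results = report["results"]

# Headline numbers, using the metric keys exactly as they appear in the "results" block.
print("MMLU acc:", results["mmlu"]["acc,none"])
print("IFEval prompt-level strict acc:", results["ifeval"]["prompt_level_strict_acc,none"])

# Accuracy for every mmlu_* entry (the individual subtasks as well as the
# humanities / other / social_sciences / stem aggregates all carry an "acc,none" field).
for task, metrics in sorted(results.items()):
    if task.startswith("mmlu_") and "acc,none" in metrics:
        print(f"{task}: {metrics['acc,none']:.4f}")

# The "config" block records how the run was launched (vLLM backend, model_args,
# batch_size=auto, seeds). An approximate reproduction with the standard lm_eval CLI
# flags would look like the following (illustrative, not a logged command):
#   lm_eval --model vllm \
#           --model_args "<model_args string from the config block>" \
#           --tasks mmlu,ifeval --num_fewshot 0 --batch_size auto
# Exact reproduction also depends on the recorded git_hash and environment details.
print("model_args:", report["config"]["model_args"])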