Index of /helm/benchmark_output/releases/v1.1.0-canary/groups/json

[ICO]NameLast modifiedSizeDescription

[DIR]Parent Directory  -  
[   ]core_scenarios_general_information.json14-Feb-2024 14:13 811K 
[   ]targeted_evaluations_general_information.json09-Jan-2024 17:38 644K 
[   ]targeted_evaluations_efficiency_detailed.json09-Jan-2024 17:38 512K 
[   ]math_chain_of_thought_math_chain_of_thought.json14-Feb-2024 14:13 280K 
[   ]reasoning_general_information.json09-Jan-2024 17:38 269K 
[   ]question_answering_general_information.json09-Jan-2024 17:38 246K 
[   ]core_scenarios_bias.json09-Jan-2024 17:38 179K 
[   ]mmlu_mmlu.json14-Feb-2024 14:13 177K 
[   ]core_scenarios_efficiency.json14-Feb-2024 14:13 176K 
[   ]core_scenarios_accuracy.json14-Feb-2024 14:13 174K 
[   ]knowledge_general_information.json09-Jan-2024 17:38 161K 
[   ]legalbench_legalbench.json14-Feb-2024 14:13 154K 
[   ]wmt_14_wmt_14.json14-Feb-2024 14:13 139K 
[   ]calibration_calibration_detailed.json09-Jan-2024 17:38 134K 
[   ]targeted_evaluations_bias.json09-Jan-2024 17:38 105K 
[   ]targeted_evaluations_accuracy.json09-Jan-2024 17:38 92K 
[   ]question_answering_bias.json09-Jan-2024 17:38 83K 
[   ]core_scenarios_robustness.json09-Jan-2024 17:38 68K 
[   ]core_scenarios_fairness.json09-Jan-2024 17:38 68K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:counting_and_probability,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 67K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:intermediate_algebra,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 67K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:geometry,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 66K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:prealgebra,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 65K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:precalculus,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 65K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:number_theory,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 65K 
[   ]math_chain_of_thought_math_chain_of_thought_subject:algebra,level:1,use_official_examples:False,use_chain_of_thought:True.json14-Feb-2024 14:13 65K 
[   ]core_scenarios_summarization_metrics.json09-Jan-2024 17:38 62K 
[   ]natural_qa_openbook_longans_natural_qa_openbook_longans_mode:openbook_longans.json14-Feb-2024 14:13 59K 
[   ]legalbench_legalbench_subset:corporate_lobbying.json14-Feb-2024 14:13 58K 
[   ]legalbench_legalbench_subset:function_of_decision_section.json14-Feb-2024 14:13 58K 
[   ]legalbench_legalbench_subset:international_citizenship_questions.json14-Feb-2024 14:13 58K 
[   ]mmlu_mmlu_subject:econometrics.json14-Feb-2024 14:13 58K 
[   ]mmlu_mmlu_subject:us_foreign_policy.json14-Feb-2024 14:13 58K 
[   ]mmlu_mmlu_subject:computer_security.json14-Feb-2024 14:13 58K 
[   ]mmlu_mmlu_subject:college_chemistry.json14-Feb-2024 14:13 58K 
[   ]mmlu_mmlu_subject:abstract_algebra.json14-Feb-2024 14:13 58K 
[   ]wmt_14_wmt_14_source_language:hi,target_language:en.json14-Feb-2024 14:13 57K 
[   ]natural_qa_closedbook_natural_qa_closedbook_mode:closedbook.json14-Feb-2024 14:13 57K 
[   ]wmt_14_wmt_14_source_language:de,target_language:en.json14-Feb-2024 14:13 57K 
[   ]wmt_14_wmt_14_source_language:ru,target_language:en.json14-Feb-2024 14:13 57K 
[   ]wmt_14_wmt_14_source_language:fr,target_language:en.json14-Feb-2024 14:13 57K 
[   ]wmt_14_wmt_14_source_language:cs,target_language:en.json14-Feb-2024 14:13 57K 
[   ]openbookqa_openbookqa_.json14-Feb-2024 14:13 57K 
[   ]legalbench_legalbench_subset:abercrombie.json14-Feb-2024 14:13 56K 
[   ]narrative_qa_narrative_qa_.json14-Feb-2024 14:13 55K 
[   ]legalbench_legalbench_subset:proa.json14-Feb-2024 14:13 55K 
[   ]core_scenarios_calibration.json09-Jan-2024 17:38 55K 
[   ]med_qa_med_qa_.json14-Feb-2024 14:13 54K 
[   ]gsm_gsm_.json14-Feb-2024 14:13 53K 
[   ]question_answering_robustness.json09-Jan-2024 17:38 52K 
[   ]question_answering_fairness.json09-Jan-2024 17:38 51K 
[   ]question_answering_accuracy.json09-Jan-2024 17:38 50K 
[   ]reasoning_efficiency.json09-Jan-2024 17:38 49K 
[   ]reasoning_accuracy.json09-Jan-2024 17:38 49K 
[   ]core_scenarios_toxicity.json09-Jan-2024 17:38 45K 
[   ]question_answering_calibration.json09-Jan-2024 17:38 45K 
[   ]question_answering_efficiency.json09-Jan-2024 17:38 40K 
[   ]knowledge_accuracy.json09-Jan-2024 17:38 35K 
[   ]targeted_evaluations_robustness.json09-Jan-2024 17:38 33K 
[   ]knowledge_robustness.json09-Jan-2024 17:38 32K 
[   ]targeted_evaluations_fairness.json09-Jan-2024 17:38 32K 
[   ]knowledge_fairness.json09-Jan-2024 17:38 32K 
[   ]targeted_evaluations_calibration.json09-Jan-2024 17:38 28K 
[   ]knowledge_calibration.json09-Jan-2024 17:38 28K 
[   ]targeted_evaluations_toxicity.json09-Jan-2024 17:38 28K 
[   ]knowledge_efficiency.json09-Jan-2024 17:38 28K 
[   ]targeted_evaluations_copyright_metrics.json09-Jan-2024 17:38 24K 
[   ]question_answering_toxicity.json09-Jan-2024 17:38 23K 
[   ]knowledge_bias.json09-Jan-2024 17:38 23K 
[   ]calibration_accuracy.json09-Jan-2024 17:38 22K 
[   ]targeted_evaluations_disinformation_metrics.json09-Jan-2024 17:38 18K 
[   ]targeted_evaluations_bbq_metrics.json09-Jan-2024 17:38 12K 
[   ]targeted_evaluations_apps_metrics.json09-Jan-2024 17:38 12K 
[   ]reasoning_apps_metrics.json09-Jan-2024 17:38 12K 
[   ]knowledge_toxicity.json09-Jan-2024 17:38 9.6K 

Apache/2.2.15 (CentOS) Server at nlp.stanford.edu Port 443