Index of /helm/benchmark_output/releases/v1.1.0-canary/groups/json

Name	Last modified	Size

Parent Directory		-
core_scenarios_general_information.json	14-Feb-2024 14:13	811K
targeted_evaluations_general_information.json	09-Jan-2024 17:38	644K
targeted_evaluations_efficiency_detailed.json	09-Jan-2024 17:38	512K
math_chain_of_thought_math_chain_of_thought.json	14-Feb-2024 14:13	280K
reasoning_general_information.json	09-Jan-2024 17:38	269K
question_answering_general_information.json	09-Jan-2024 17:38	246K
core_scenarios_bias.json	09-Jan-2024 17:38	179K
mmlu_mmlu.json	14-Feb-2024 14:13	177K
core_scenarios_efficiency.json	14-Feb-2024 14:13	176K
core_scenarios_accuracy.json	14-Feb-2024 14:13	174K
knowledge_general_information.json	09-Jan-2024 17:38	161K
legalbench_legalbench.json	14-Feb-2024 14:13	154K
wmt_14_wmt_14.json	14-Feb-2024 14:13	139K
calibration_calibration_detailed.json	09-Jan-2024 17:38	134K
targeted_evaluations_bias.json	09-Jan-2024 17:38	105K
targeted_evaluations_accuracy.json	09-Jan-2024 17:38	92K
question_answering_bias.json	09-Jan-2024 17:38	83K
core_scenarios_robustness.json	09-Jan-2024 17:38	68K
core_scenarios_fairness.json	09-Jan-2024 17:38	68K
math_chain_of_thought_math_chain_of_thought_subject:counting_and_probability,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	67K
math_chain_of_thought_math_chain_of_thought_subject:intermediate_algebra,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	67K
math_chain_of_thought_math_chain_of_thought_subject:geometry,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	66K
math_chain_of_thought_math_chain_of_thought_subject:prealgebra,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	65K
math_chain_of_thought_math_chain_of_thought_subject:precalculus,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	65K
math_chain_of_thought_math_chain_of_thought_subject:number_theory,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	65K
math_chain_of_thought_math_chain_of_thought_subject:algebra,level:1,use_official_examples:False,use_chain_of_thought:True.json	14-Feb-2024 14:13	65K
core_scenarios_summarization_metrics.json	09-Jan-2024 17:38	62K
natural_qa_openbook_longans_natural_qa_openbook_longans_mode:openbook_longans.json	14-Feb-2024 14:13	59K
legalbench_legalbench_subset:corporate_lobbying.json	14-Feb-2024 14:13	58K
legalbench_legalbench_subset:function_of_decision_section.json	14-Feb-2024 14:13	58K
legalbench_legalbench_subset:international_citizenship_questions.json	14-Feb-2024 14:13	58K
mmlu_mmlu_subject:econometrics.json	14-Feb-2024 14:13	58K
mmlu_mmlu_subject:us_foreign_policy.json	14-Feb-2024 14:13	58K
mmlu_mmlu_subject:computer_security.json	14-Feb-2024 14:13	58K
mmlu_mmlu_subject:college_chemistry.json	14-Feb-2024 14:13	58K
mmlu_mmlu_subject:abstract_algebra.json	14-Feb-2024 14:13	58K
wmt_14_wmt_14_source_language:hi,target_language:en.json	14-Feb-2024 14:13	57K
natural_qa_closedbook_natural_qa_closedbook_mode:closedbook.json	14-Feb-2024 14:13	57K
wmt_14_wmt_14_source_language:de,target_language:en.json	14-Feb-2024 14:13	57K
wmt_14_wmt_14_source_language:ru,target_language:en.json	14-Feb-2024 14:13	57K
wmt_14_wmt_14_source_language:fr,target_language:en.json	14-Feb-2024 14:13	57K
wmt_14_wmt_14_source_language:cs,target_language:en.json	14-Feb-2024 14:13	57K
openbookqa_openbookqa_.json	14-Feb-2024 14:13	57K
legalbench_legalbench_subset:abercrombie.json	14-Feb-2024 14:13	56K
narrative_qa_narrative_qa_.json	14-Feb-2024 14:13	55K
legalbench_legalbench_subset:proa.json	14-Feb-2024 14:13	55K
core_scenarios_calibration.json	09-Jan-2024 17:38	55K
med_qa_med_qa_.json	14-Feb-2024 14:13	54K
gsm_gsm_.json	14-Feb-2024 14:13	53K
question_answering_robustness.json	09-Jan-2024 17:38	52K
question_answering_fairness.json	09-Jan-2024 17:38	51K
question_answering_accuracy.json	09-Jan-2024 17:38	50K
reasoning_efficiency.json	09-Jan-2024 17:38	49K
reasoning_accuracy.json	09-Jan-2024 17:38	49K
core_scenarios_toxicity.json	09-Jan-2024 17:38	45K
question_answering_calibration.json	09-Jan-2024 17:38	45K
question_answering_efficiency.json	09-Jan-2024 17:38	40K
knowledge_accuracy.json	09-Jan-2024 17:38	35K
targeted_evaluations_robustness.json	09-Jan-2024 17:38	33K
knowledge_robustness.json	09-Jan-2024 17:38	32K
targeted_evaluations_fairness.json	09-Jan-2024 17:38	32K
knowledge_fairness.json	09-Jan-2024 17:38	32K
targeted_evaluations_calibration.json	09-Jan-2024 17:38	28K
knowledge_calibration.json	09-Jan-2024 17:38	28K
targeted_evaluations_toxicity.json	09-Jan-2024 17:38	28K
knowledge_efficiency.json	09-Jan-2024 17:38	28K
targeted_evaluations_copyright_metrics.json	09-Jan-2024 17:38	24K
question_answering_toxicity.json	09-Jan-2024 17:38	23K
knowledge_bias.json	09-Jan-2024 17:38	23K
calibration_accuracy.json	09-Jan-2024 17:38	22K
targeted_evaluations_disinformation_metrics.json	09-Jan-2024 17:38	18K
targeted_evaluations_bbq_metrics.json	09-Jan-2024 17:38	12K
targeted_evaluations_apps_metrics.json	09-Jan-2024 17:38	12K
reasoning_apps_metrics.json	09-Jan-2024 17:38	12K
knowledge_toxicity.json	09-Jan-2024 17:38	9.6K

Apache/2.2.15 (CentOS) Server at nlp.stanford.edu Port 443