[
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-opus-20240229",
      "model": "anthropic/claude-3-opus-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-sonnet-20240229",
      "model": "anthropic/claude-3-sonnet-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/gemini-pro",
      "model": "google/gemini-pro",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "abstract_algebra"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_abstract_algebra"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-opus-20240229",
      "model": "anthropic/claude-3-opus-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-sonnet-20240229",
      "model": "anthropic/claude-3-sonnet-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/gemini-pro",
      "model": "google/gemini-pro",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "college_chemistry"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_college_chemistry"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-opus-20240229",
      "model": "anthropic/claude-3-opus-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-sonnet-20240229",
      "model": "anthropic/claude-3-sonnet-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/gemini-pro",
      "model": "google/gemini-pro",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "computer_security"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_computer_security"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-opus-20240229",
      "model": "anthropic/claude-3-opus-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-sonnet-20240229",
      "model": "anthropic/claude-3-sonnet-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/gemini-pro",
      "model": "google/gemini-pro",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "econometrics"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_econometrics"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-opus-20240229",
      "model": "anthropic/claude-3-opus-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-sonnet-20240229",
      "model": "anthropic/claude-3-sonnet-20240229",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/gemini-pro",
      "model": "google/gemini-pro",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
      "args": {
        "subject": "us_foreign_policy"
      }
    },
    "adapter_spec": {
      "method": "multiple_choice_joint",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 10000,
      "num_outputs": 5,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 1,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false,
      "eval_splits": [
        "test"
      ]
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "prefix_exact_match",
            "quasi_prefix_exact_match"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "mmlu",
      "mmlu_us_foreign_policy"
    ]
  },
  {
    "name": "narrative_qa:model=01-ai_yi-6b,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=anthropic_claude-2.1,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=anthropic_claude-instant-1.2,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=google_gemma-7b,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=google_gemma-7b-it,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=google_text-bison@001,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=google_text-unicorn@001,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=meta_llama-2-7b,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=microsoft_phi-2,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=openai_gpt-3.5-turbo-0613,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=openai_gpt-4-1106-preview,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "narrative_qa:model=qwen_qwen1.5-7b,additional_instructions=narrative_qa",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": " Answer only with a single letter corresponding to the correct option.\n",
      "input_prefix": "Passage: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 100,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score",
            "rouge_l",
            "bleu_1",
            "bleu_4"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "narrative_qa"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=01-ai_yi-6b,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=anthropic_claude-2.1,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-2.1",
      "model": "anthropic/claude-2.1",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "\n\nHuman:",
      "global_suffix": "\n\nAssistant:",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-instant-1.2",
      "model": "anthropic/claude-instant-1.2",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=google_gemma-7b,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=google_gemma-7b-it,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=google_text-bison@001,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=google_text-unicorn@001,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=meta_llama-2-7b,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=microsoft_phi-2,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b,additional_instructions=natural_qa_closedbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "closedbook"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "Question: ",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_closedbook"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=01-ai_yi-6b,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/yi-6b",
      "model": "01-ai/yi-6b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=google_gemma-7b,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b",
      "model": "google/gemma-7b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=google_gemma-7b-it,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/gemma-7b-it",
      "model": "google/gemma-7b-it",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=google_text-bison@001,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-bison@001",
      "model": "google/text-bison@001",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=google_text-unicorn@001,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "google/text-unicorn@001",
      "model": "google/text-unicorn@001",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=meta_llama-2-7b,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-2-7b",
      "model": "meta/llama-2-7b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=microsoft_phi-2,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/phi-2",
      "model": "microsoft/phi-2",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/mixtral-8x7b-32kseqlen",
      "model": "mistralai/mixtral-8x7b-32kseqlen",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-3.5-turbo-0613",
      "model": "openai/gpt-3.5-turbo-0613",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4-1106-preview",
      "model": "openai/gpt-4-1106-preview",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  },
  {
    "name": "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b,additional_instructions=natural_qa_openbook",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
      "args": {
        "mode": "openbook_longans"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
      "input_prefix": "",
      "input_suffix": "\n",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "Answer: ",
      "output_suffix": "\n",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 5,
      "max_eval_instances": 50,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/qwen1.5-7b",
      "model": "qwen/qwen1.5-7b",
      "temperature": 0.0,
      "max_tokens": 300,
      "stop_sequences": [
        "\n"
      ],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": [
            "exact_match",
            "quasi_exact_match",
            "f1_score"
          ]
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "natural_qa_openbook_longans"
    ]
  }
]