[
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 395.67,
        "sum_squared": 156554.7489,
        "min": 395.67,
        "max": 395.67,
        "mean": 395.67,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29966460943222045,
        "sum_squared": 0.08979887814616523,
        "min": 0.29966460943222045,
        "max": 0.29966460943222045,
        "mean": 0.29966460943222045,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 395.67,
        "sum_squared": 156554.7489,
        "min": 395.67,
        "max": 395.67,
        "mean": 395.67,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 395.67,
        "sum_squared": 156554.7489,
        "min": 395.67,
        "max": 395.67,
        "mean": 395.67,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29966460943222045,
        "sum_squared": 0.08979887814616523,
        "min": 0.29966460943222045,
        "max": 0.29966460943222045,
        "mean": 0.29966460943222045,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29966460943222045,
        "sum_squared": 0.08979887814616523,
        "min": 0.29966460943222045,
        "max": 0.29966460943222045,
        "mean": 0.29966460943222045,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 381.26,
        "sum_squared": 145359.1876,
        "min": 381.26,
        "max": 381.26,
        "mean": 381.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.4279474997520447,
        "sum_squared": 2.0390340620481155,
        "min": 1.4279474997520447,
        "max": 1.4279474997520447,
        "mean": 1.4279474997520447,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.39,
        "sum_squared": 1.9320999999999997,
        "min": 1.39,
        "max": 1.39,
        "mean": 1.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 381.26,
        "sum_squared": 145359.1876,
        "min": 381.26,
        "max": 381.26,
        "mean": 381.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 381.26,
        "sum_squared": 145359.1876,
        "min": 381.26,
        "max": 381.26,
        "mean": 381.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4279474997520447,
        "sum_squared": 2.0390340620481155,
        "min": 1.4279474997520447,
        "max": 1.4279474997520447,
        "mean": 1.4279474997520447,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4279474997520447,
        "sum_squared": 2.0390340620481155,
        "min": 1.4279474997520447,
        "max": 1.4279474997520447,
        "mean": 1.4279474997520447,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31,
        "sum_squared": 0.0961,
        "min": 0.31,
        "max": 0.31,
        "mean": 0.31,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.39,
        "sum_squared": 1.9320999999999997,
        "min": 1.39,
        "max": 1.39,
        "mean": 1.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.39,
        "sum_squared": 1.9320999999999997,
        "min": 1.39,
        "max": 1.39,
        "mean": 1.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-opus-20240229",
        "model": "anthropic/claude-3-opus-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 373.26,
        "sum_squared": 139323.0276,
        "min": 373.26,
        "max": 373.26,
        "mean": 373.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 4.4627389192581175,
        "sum_squared": 19.916038661461112,
        "min": 4.4627389192581175,
        "max": 4.4627389192581175,
        "mean": 4.4627389192581175,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 373.26,
        "sum_squared": 139323.0276,
        "min": 373.26,
        "max": 373.26,
        "mean": 373.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 373.26,
        "sum_squared": 139323.0276,
        "min": 373.26,
        "max": 373.26,
        "mean": 373.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.4627389192581175,
        "sum_squared": 19.916038661461112,
        "min": 4.4627389192581175,
        "max": 4.4627389192581175,
        "mean": 4.4627389192581175,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.4627389192581175,
        "sum_squared": 19.916038661461112,
        "min": 4.4627389192581175,
        "max": 4.4627389192581175,
        "mean": 4.4627389192581175,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-sonnet-20240229",
        "model": "anthropic/claude-3-sonnet-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 373.26,
        "sum_squared": 139323.0276,
        "min": 373.26,
        "max": 373.26,
        "mean": 373.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.8704465341567993,
        "sum_squared": 3.4985702371391825,
        "min": 1.8704465341567993,
        "max": 1.8704465341567993,
        "mean": 1.8704465341567993,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 373.26,
        "sum_squared": 139323.0276,
        "min": 373.26,
        "max": 373.26,
        "mean": 373.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 373.26,
        "sum_squared": 139323.0276,
        "min": 373.26,
        "max": 373.26,
        "mean": 373.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.8704465341567993,
        "sum_squared": 3.4985702371391825,
        "min": 1.8704465341567993,
        "max": 1.8704465341567993,
        "mean": 1.8704465341567993,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.8704465341567993,
        "sum_squared": 3.4985702371391825,
        "min": 1.8704465341567993,
        "max": 1.8704465341567993,
        "mean": 1.8704465341567993,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 381.26,
        "sum_squared": 145359.1876,
        "min": 381.26,
        "max": 381.26,
        "mean": 381.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8429116034507751,
        "sum_squared": 0.7104999712319567,
        "min": 0.8429116034507751,
        "max": 0.8429116034507751,
        "mean": 0.8429116034507751,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.36,
        "sum_squared": 0.1296,
        "min": 0.36,
        "max": 0.36,
        "mean": 0.36,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.36,
        "sum_squared": 0.1296,
        "min": 0.36,
        "max": 0.36,
        "mean": 0.36,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 381.26,
        "sum_squared": 145359.1876,
        "min": 381.26,
        "max": 381.26,
        "mean": 381.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 381.26,
        "sum_squared": 145359.1876,
        "min": 381.26,
        "max": 381.26,
        "mean": 381.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8429116034507751,
        "sum_squared": 0.7104999712319567,
        "min": 0.8429116034507751,
        "max": 0.8429116034507751,
        "mean": 0.8429116034507751,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8429116034507751,
        "sum_squared": 0.7104999712319567,
        "min": 0.8429116034507751,
        "max": 0.8429116034507751,
        "mean": 0.8429116034507751,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36,
        "sum_squared": 0.1296,
        "min": 0.36,
        "max": 0.36,
        "mean": 0.36,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36,
        "sum_squared": 0.1296,
        "min": 0.36,
        "max": 0.36,
        "mean": 0.36,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36,
        "sum_squared": 0.1296,
        "min": 0.36,
        "max": 0.36,
        "mean": 0.36,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36,
        "sum_squared": 0.1296,
        "min": 0.36,
        "max": 0.36,
        "mean": 0.36,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-pro",
        "model": "google/gemini-pro",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9730515599250793,
        "sum_squared": 0.9468293382726302,
        "min": 0.9730515599250793,
        "max": 0.9730515599250793,
        "mean": 0.9730515599250793,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9730515599250793,
        "sum_squared": 0.9468293382726302,
        "min": 0.9730515599250793,
        "max": 0.9730515599250793,
        "mean": 0.9730515599250793,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9730515599250793,
        "sum_squared": 0.9468293382726302,
        "min": 0.9730515599250793,
        "max": 0.9730515599250793,
        "mean": 0.9730515599250793,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35,
        "sum_squared": 0.12249999999999998,
        "min": 0.35,
        "max": 0.35,
        "mean": 0.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28615272998809815,
        "sum_squared": 0.0818833848796414,
        "min": 0.28615272998809815,
        "max": 0.28615272998809815,
        "mean": 0.28615272998809815,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28615272998809815,
        "sum_squared": 0.0818833848796414,
        "min": 0.28615272998809815,
        "max": 0.28615272998809815,
        "mean": 0.28615272998809815,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28615272998809815,
        "sum_squared": 0.0818833848796414,
        "min": 0.28615272998809815,
        "max": 0.28615272998809815,
        "mean": 0.28615272998809815,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.24,
        "sum_squared": 0.0576,
        "min": 0.24,
        "max": 0.24,
        "mean": 0.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27986535549163816,
        "sum_squared": 0.078324617204461,
        "min": 0.27986535549163816,
        "max": 0.27986535549163816,
        "mean": 0.27986535549163816,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 387.97,
        "sum_squared": 150520.72090000001,
        "min": 387.97,
        "max": 387.97,
        "mean": 387.97,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27986535549163816,
        "sum_squared": 0.078324617204461,
        "min": 0.27986535549163816,
        "max": 0.27986535549163816,
        "mean": 0.27986535549163816,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27986535549163816,
        "sum_squared": 0.078324617204461,
        "min": 0.27986535549163816,
        "max": 0.27986535549163816,
        "mean": 0.27986535549163816,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 399.12,
        "sum_squared": 159296.7744,
        "min": 399.12,
        "max": 399.12,
        "mean": 399.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7742284727096558,
        "sum_squared": 0.5994297279543263,
        "min": 0.7742284727096558,
        "max": 0.7742284727096558,
        "mean": 0.7742284727096558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 399.12,
        "sum_squared": 159296.7744,
        "min": 399.12,
        "max": 399.12,
        "mean": 399.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 399.12,
        "sum_squared": 159296.7744,
        "min": 399.12,
        "max": 399.12,
        "mean": 399.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7742284727096558,
        "sum_squared": 0.5994297279543263,
        "min": 0.7742284727096558,
        "max": 0.7742284727096558,
        "mean": 0.7742284727096558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7742284727096558,
        "sum_squared": 0.5994297279543263,
        "min": 0.7742284727096558,
        "max": 0.7742284727096558,
        "mean": 0.7742284727096558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 399.12,
        "sum_squared": 159296.7744,
        "min": 399.12,
        "max": 399.12,
        "mean": 399.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.2322469472885131,
        "sum_squared": 1.5184325391018596,
        "min": 1.2322469472885131,
        "max": 1.2322469472885131,
        "mean": 1.2322469472885131,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 399.12,
        "sum_squared": 159296.7744,
        "min": 399.12,
        "max": 399.12,
        "mean": 399.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 399.12,
        "sum_squared": 159296.7744,
        "min": 399.12,
        "max": 399.12,
        "mean": 399.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2322469472885131,
        "sum_squared": 1.5184325391018596,
        "min": 1.2322469472885131,
        "max": 1.2322469472885131,
        "mean": 1.2322469472885131,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2322469472885131,
        "sum_squared": 1.5184325391018596,
        "min": 1.2322469472885131,
        "max": 1.2322469472885131,
        "mean": 1.2322469472885131,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 409.65,
        "sum_squared": 167813.12249999997,
        "min": 409.65,
        "max": 409.65,
        "mean": 409.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2816541409492493,
        "sum_squared": 0.07932905511385958,
        "min": 0.2816541409492493,
        "max": 0.2816541409492493,
        "mean": 0.2816541409492493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 409.65,
        "sum_squared": 167813.12249999997,
        "min": 409.65,
        "max": 409.65,
        "mean": 409.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 409.65,
        "sum_squared": 167813.12249999997,
        "min": 409.65,
        "max": 409.65,
        "mean": 409.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2816541409492493,
        "sum_squared": 0.07932905511385958,
        "min": 0.2816541409492493,
        "max": 0.2816541409492493,
        "mean": 0.2816541409492493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2816541409492493,
        "sum_squared": 0.07932905511385958,
        "min": 0.2816541409492493,
        "max": 0.2816541409492493,
        "mean": 0.2816541409492493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 383.38,
        "sum_squared": 146980.2244,
        "min": 383.38,
        "max": 383.38,
        "mean": 383.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2711219882965088,
        "sum_squared": 0.07350713253785225,
        "min": 0.2711219882965088,
        "max": 0.2711219882965088,
        "mean": 0.2711219882965088,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.38,
        "sum_squared": 146980.2244,
        "min": 383.38,
        "max": 383.38,
        "mean": 383.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.38,
        "sum_squared": 146980.2244,
        "min": 383.38,
        "max": 383.38,
        "mean": 383.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2711219882965088,
        "sum_squared": 0.07350713253785225,
        "min": 0.2711219882965088,
        "max": 0.2711219882965088,
        "mean": 0.2711219882965088,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2711219882965088,
        "sum_squared": 0.07350713253785225,
        "min": 0.2711219882965088,
        "max": 0.2711219882965088,
        "mean": 0.2711219882965088,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33,
        "sum_squared": 0.10890000000000001,
        "min": 0.33,
        "max": 0.33,
        "mean": 0.33,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 414.44,
        "sum_squared": 171760.5136,
        "min": 414.44,
        "max": 414.44,
        "mean": 414.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3009379529953003,
        "sum_squared": 0.09056365155300157,
        "min": 0.3009379529953003,
        "max": 0.3009379529953003,
        "mean": 0.3009379529953003,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 414.44,
        "sum_squared": 171760.5136,
        "min": 414.44,
        "max": 414.44,
        "mean": 414.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 414.44,
        "sum_squared": 171760.5136,
        "min": 414.44,
        "max": 414.44,
        "mean": 414.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3009379529953003,
        "sum_squared": 0.09056365155300157,
        "min": 0.3009379529953003,
        "max": 0.3009379529953003,
        "mean": 0.3009379529953003,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3009379529953003,
        "sum_squared": 0.09056365155300157,
        "min": 0.3009379529953003,
        "max": 0.3009379529953003,
        "mean": 0.3009379529953003,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 378.44,
        "sum_squared": 143216.8336,
        "min": 378.44,
        "max": 378.44,
        "mean": 378.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.444822461605072,
        "sum_squared": 0.19786702234839576,
        "min": 0.444822461605072,
        "max": 0.444822461605072,
        "mean": 0.444822461605072,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 378.44,
        "sum_squared": 143216.8336,
        "min": 378.44,
        "max": 378.44,
        "mean": 378.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 378.44,
        "sum_squared": 143216.8336,
        "min": 378.44,
        "max": 378.44,
        "mean": 378.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.444822461605072,
        "sum_squared": 0.19786702234839576,
        "min": 0.444822461605072,
        "max": 0.444822461605072,
        "mean": 0.444822461605072,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.444822461605072,
        "sum_squared": 0.19786702234839576,
        "min": 0.444822461605072,
        "max": 0.444822461605072,
        "mean": 0.444822461605072,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32,
        "sum_squared": 0.1024,
        "min": 0.32,
        "max": 0.32,
        "mean": 0.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34,
        "sum_squared": 0.11560000000000002,
        "min": 0.34,
        "max": 0.34,
        "mean": 0.34,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 378.44,
        "sum_squared": 143216.8336,
        "min": 378.44,
        "max": 378.44,
        "mean": 378.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.49671109437942507,
        "sum_squared": 0.2467219112796061,
        "min": 0.49671109437942507,
        "max": 0.49671109437942507,
        "mean": 0.49671109437942507,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 378.44,
        "sum_squared": 143216.8336,
        "min": 378.44,
        "max": 378.44,
        "mean": 378.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 378.44,
        "sum_squared": 143216.8336,
        "min": 378.44,
        "max": 378.44,
        "mean": 378.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49671109437942507,
        "sum_squared": 0.2467219112796061,
        "min": 0.49671109437942507,
        "max": 0.49671109437942507,
        "mean": 0.49671109437942507,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49671109437942507,
        "sum_squared": 0.2467219112796061,
        "min": 0.49671109437942507,
        "max": 0.49671109437942507,
        "mean": 0.49671109437942507,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
    "run_spec": {
      "name": "mmlu:subject=abstract_algebra,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_abstract_algebra",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "abstract_algebra"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about abstract algebra. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_abstract_algebra"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 383.19,
        "sum_squared": 146834.5761,
        "min": 383.19,
        "max": 383.19,
        "mean": 383.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2627728796005249,
        "sum_squared": 0.06904958625355197,
        "min": 0.2627728796005249,
        "max": 0.2627728796005249,
        "mean": 0.2627728796005249,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.19,
        "sum_squared": 146834.5761,
        "min": 383.19,
        "max": 383.19,
        "mean": 383.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.19,
        "sum_squared": 146834.5761,
        "min": 383.19,
        "max": 383.19,
        "mean": 383.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2627728796005249,
        "sum_squared": 0.06904958625355197,
        "min": 0.2627728796005249,
        "max": 0.2627728796005249,
        "mean": 0.2627728796005249,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2627728796005249,
        "sum_squared": 0.06904958625355197,
        "min": 0.2627728796005249,
        "max": 0.2627728796005249,
        "mean": 0.2627728796005249,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39,
        "sum_squared": 0.1521,
        "min": 0.39,
        "max": 0.39,
        "mean": 0.39,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 609.54,
        "sum_squared": 371539.01159999997,
        "min": 609.54,
        "max": 609.54,
        "mean": 609.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3079850196838379,
        "sum_squared": 0.09485477234965402,
        "min": 0.3079850196838379,
        "max": 0.3079850196838379,
        "mean": 0.3079850196838379,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 609.54,
        "sum_squared": 371539.01159999997,
        "min": 609.54,
        "max": 609.54,
        "mean": 609.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 609.54,
        "sum_squared": 371539.01159999997,
        "min": 609.54,
        "max": 609.54,
        "mean": 609.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3079850196838379,
        "sum_squared": 0.09485477234965402,
        "min": 0.3079850196838379,
        "max": 0.3079850196838379,
        "mean": 0.3079850196838379,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3079850196838379,
        "sum_squared": 0.09485477234965402,
        "min": 0.3079850196838379,
        "max": 0.3079850196838379,
        "mean": 0.3079850196838379,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42,
        "sum_squared": 0.17639999999999997,
        "min": 0.42,
        "max": 0.42,
        "mean": 0.42,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 561.01,
        "sum_squared": 314732.2201,
        "min": 561.01,
        "max": 561.01,
        "mean": 561.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.4472868704795838,
        "sum_squared": 2.0946392854625877,
        "min": 1.4472868704795838,
        "max": 1.4472868704795838,
        "mean": 1.4472868704795838,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 5.32,
        "sum_squared": 28.302400000000002,
        "min": 5.32,
        "max": 5.32,
        "mean": 5.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 561.01,
        "sum_squared": 314732.2201,
        "min": 561.01,
        "max": 561.01,
        "mean": 561.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 561.01,
        "sum_squared": 314732.2201,
        "min": 561.01,
        "max": 561.01,
        "mean": 561.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4472868704795838,
        "sum_squared": 2.0946392854625877,
        "min": 1.4472868704795838,
        "max": 1.4472868704795838,
        "mean": 1.4472868704795838,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4472868704795838,
        "sum_squared": 2.0946392854625877,
        "min": 1.4472868704795838,
        "max": 1.4472868704795838,
        "mean": 1.4472868704795838,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.32,
        "sum_squared": 28.302400000000002,
        "min": 5.32,
        "max": 5.32,
        "mean": 5.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.32,
        "sum_squared": 28.302400000000002,
        "min": 5.32,
        "max": 5.32,
        "mean": 5.32,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-opus-20240229",
        "model": "anthropic/claude-3-opus-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 553.01,
        "sum_squared": 305820.0601,
        "min": 553.01,
        "max": 553.01,
        "mean": 553.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 4.720696823596954,
        "sum_squared": 22.284978500318374,
        "min": 4.720696823596954,
        "max": 4.720696823596954,
        "mean": 4.720696823596954,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 553.01,
        "sum_squared": 305820.0601,
        "min": 553.01,
        "max": 553.01,
        "mean": 553.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 553.01,
        "sum_squared": 305820.0601,
        "min": 553.01,
        "max": 553.01,
        "mean": 553.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.720696823596954,
        "sum_squared": 22.284978500318374,
        "min": 4.720696823596954,
        "max": 4.720696823596954,
        "mean": 4.720696823596954,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.720696823596954,
        "sum_squared": 22.284978500318374,
        "min": 4.720696823596954,
        "max": 4.720696823596954,
        "mean": 4.720696823596954,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-sonnet-20240229",
        "model": "anthropic/claude-3-sonnet-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 553.01,
        "sum_squared": 305820.0601,
        "min": 553.01,
        "max": 553.01,
        "mean": 553.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.7630544471740723,
        "sum_squared": 3.108360983700274,
        "min": 1.7630544471740723,
        "max": 1.7630544471740723,
        "mean": 1.7630544471740723,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 553.01,
        "sum_squared": 305820.0601,
        "min": 553.01,
        "max": 553.01,
        "mean": 553.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 553.01,
        "sum_squared": 305820.0601,
        "min": 553.01,
        "max": 553.01,
        "mean": 553.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.7630544471740723,
        "sum_squared": 3.108360983700274,
        "min": 1.7630544471740723,
        "max": 1.7630544471740723,
        "mean": 1.7630544471740723,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.7630544471740723,
        "sum_squared": 3.108360983700274,
        "min": 1.7630544471740723,
        "max": 1.7630544471740723,
        "mean": 1.7630544471740723,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 561.01,
        "sum_squared": 314732.2201,
        "min": 561.01,
        "max": 561.01,
        "mean": 561.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0819169330596923,
        "sum_squared": 1.1705442500412906,
        "min": 1.0819169330596923,
        "max": 1.0819169330596923,
        "mean": 1.0819169330596923,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 561.01,
        "sum_squared": 314732.2201,
        "min": 561.01,
        "max": 561.01,
        "mean": 561.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 561.01,
        "sum_squared": 314732.2201,
        "min": 561.01,
        "max": 561.01,
        "mean": 561.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0819169330596923,
        "sum_squared": 1.1705442500412906,
        "min": 1.0819169330596923,
        "max": 1.0819169330596923,
        "mean": 1.0819169330596923,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0819169330596923,
        "sum_squared": 1.1705442500412906,
        "min": 1.0819169330596923,
        "max": 1.0819169330596923,
        "mean": 1.0819169330596923,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-pro",
        "model": "google/gemini-pro",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.502180278301239,
        "sum_squared": 0.25218503191470987,
        "min": 0.502180278301239,
        "max": 0.502180278301239,
        "mean": 0.502180278301239,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.502180278301239,
        "sum_squared": 0.25218503191470987,
        "min": 0.502180278301239,
        "max": 0.502180278301239,
        "mean": 0.502180278301239,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.502180278301239,
        "sum_squared": 0.25218503191470987,
        "min": 0.502180278301239,
        "max": 0.502180278301239,
        "mean": 0.502180278301239,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28670950651168825,
        "sum_squared": 0.0822023411241758,
        "min": 0.28670950651168825,
        "max": 0.28670950651168825,
        "mean": 0.28670950651168825,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28670950651168825,
        "sum_squared": 0.0822023411241758,
        "min": 0.28670950651168825,
        "max": 0.28670950651168825,
        "mean": 0.28670950651168825,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28670950651168825,
        "sum_squared": 0.0822023411241758,
        "min": 0.28670950651168825,
        "max": 0.28670950651168825,
        "mean": 0.28670950651168825,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49,
        "sum_squared": 0.24009999999999998,
        "min": 0.49,
        "max": 0.49,
        "mean": 0.49,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2719404149055481,
        "sum_squared": 0.07395158925900165,
        "min": 0.2719404149055481,
        "max": 0.2719404149055481,
        "mean": 0.2719404149055481,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 574.02,
        "sum_squared": 329498.9604,
        "min": 574.02,
        "max": 574.02,
        "mean": 574.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2719404149055481,
        "sum_squared": 0.07395158925900165,
        "min": 0.2719404149055481,
        "max": 0.2719404149055481,
        "mean": 0.2719404149055481,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2719404149055481,
        "sum_squared": 0.07395158925900165,
        "min": 0.2719404149055481,
        "max": 0.2719404149055481,
        "mean": 0.2719404149055481,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19,
        "sum_squared": 0.0361,
        "min": 0.19,
        "max": 0.19,
        "mean": 0.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 585.7,
        "sum_squared": 343044.49000000005,
        "min": 585.7,
        "max": 585.7,
        "mean": 585.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7551529717445373,
        "sum_squared": 0.570256010734606,
        "min": 0.7551529717445373,
        "max": 0.7551529717445373,
        "mean": 0.7551529717445373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 585.7,
        "sum_squared": 343044.49000000005,
        "min": 585.7,
        "max": 585.7,
        "mean": 585.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 585.7,
        "sum_squared": 343044.49000000005,
        "min": 585.7,
        "max": 585.7,
        "mean": 585.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7551529717445373,
        "sum_squared": 0.570256010734606,
        "min": 0.7551529717445373,
        "max": 0.7551529717445373,
        "mean": 0.7551529717445373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7551529717445373,
        "sum_squared": 0.570256010734606,
        "min": 0.7551529717445373,
        "max": 0.7551529717445373,
        "mean": 0.7551529717445373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 585.7,
        "sum_squared": 343044.49000000005,
        "min": 585.7,
        "max": 585.7,
        "mean": 585.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.2152568459510804,
        "sum_squared": 1.476849201630968,
        "min": 1.2152568459510804,
        "max": 1.2152568459510804,
        "mean": 1.2152568459510804,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 585.7,
        "sum_squared": 343044.49000000005,
        "min": 585.7,
        "max": 585.7,
        "mean": 585.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 585.7,
        "sum_squared": 343044.49000000005,
        "min": 585.7,
        "max": 585.7,
        "mean": 585.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2152568459510804,
        "sum_squared": 1.476849201630968,
        "min": 1.2152568459510804,
        "max": 1.2152568459510804,
        "mean": 1.2152568459510804,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2152568459510804,
        "sum_squared": 1.476849201630968,
        "min": 1.2152568459510804,
        "max": 1.2152568459510804,
        "mean": 1.2152568459510804,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.57,
        "sum_squared": 0.32489999999999997,
        "min": 0.57,
        "max": 0.57,
        "mean": 0.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 634.43,
        "sum_squared": 402501.4248999999,
        "min": 634.43,
        "max": 634.43,
        "mean": 634.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.30203757047653196,
        "sum_squared": 0.091226693979366,
        "min": 0.30203757047653196,
        "max": 0.30203757047653196,
        "mean": 0.30203757047653196,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 634.43,
        "sum_squared": 402501.4248999999,
        "min": 634.43,
        "max": 634.43,
        "mean": 634.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 634.43,
        "sum_squared": 402501.4248999999,
        "min": 634.43,
        "max": 634.43,
        "mean": 634.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.30203757047653196,
        "sum_squared": 0.091226693979366,
        "min": 0.30203757047653196,
        "max": 0.30203757047653196,
        "mean": 0.30203757047653196,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.30203757047653196,
        "sum_squared": 0.091226693979366,
        "min": 0.30203757047653196,
        "max": 0.30203757047653196,
        "mean": 0.30203757047653196,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27,
        "sum_squared": 0.0729,
        "min": 0.27,
        "max": 0.27,
        "mean": 0.27,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28,
        "sum_squared": 0.07840000000000001,
        "min": 0.28,
        "max": 0.28,
        "mean": 0.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 557.4,
        "sum_squared": 310694.75999999995,
        "min": 557.4,
        "max": 557.4,
        "mean": 557.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2787207221984863,
        "sum_squared": 0.07768524098284578,
        "min": 0.2787207221984863,
        "max": 0.2787207221984863,
        "mean": 0.2787207221984863,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 557.4,
        "sum_squared": 310694.75999999995,
        "min": 557.4,
        "max": 557.4,
        "mean": 557.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 557.4,
        "sum_squared": 310694.75999999995,
        "min": 557.4,
        "max": 557.4,
        "mean": 557.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2787207221984863,
        "sum_squared": 0.07768524098284578,
        "min": 0.2787207221984863,
        "max": 0.2787207221984863,
        "mean": 0.2787207221984863,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2787207221984863,
        "sum_squared": 0.07768524098284578,
        "min": 0.2787207221984863,
        "max": 0.2787207221984863,
        "mean": 0.2787207221984863,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41,
        "sum_squared": 0.16809999999999997,
        "min": 0.41,
        "max": 0.41,
        "mean": 0.41,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 639.71,
        "sum_squared": 409228.8841,
        "min": 639.71,
        "max": 639.71,
        "mean": 639.71,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32694086313247683,
        "sum_squared": 0.10689032798580896,
        "min": 0.32694086313247683,
        "max": 0.32694086313247683,
        "mean": 0.32694086313247683,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 639.71,
        "sum_squared": 409228.8841,
        "min": 639.71,
        "max": 639.71,
        "mean": 639.71,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 639.71,
        "sum_squared": 409228.8841,
        "min": 639.71,
        "max": 639.71,
        "mean": 639.71,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32694086313247683,
        "sum_squared": 0.10689032798580896,
        "min": 0.32694086313247683,
        "max": 0.32694086313247683,
        "mean": 0.32694086313247683,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32694086313247683,
        "sum_squared": 0.10689032798580896,
        "min": 0.32694086313247683,
        "max": 0.32694086313247683,
        "mean": 0.32694086313247683,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 554.4,
        "sum_squared": 307359.36,
        "min": 554.4,
        "max": 554.4,
        "mean": 554.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3874256682395935,
        "sum_squared": 0.15009864841089557,
        "min": 0.3874256682395935,
        "max": 0.3874256682395935,
        "mean": 0.3874256682395935,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 554.4,
        "sum_squared": 307359.36,
        "min": 554.4,
        "max": 554.4,
        "mean": 554.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 554.4,
        "sum_squared": 307359.36,
        "min": 554.4,
        "max": 554.4,
        "mean": 554.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3874256682395935,
        "sum_squared": 0.15009864841089557,
        "min": 0.3874256682395935,
        "max": 0.3874256682395935,
        "mean": 0.3874256682395935,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3874256682395935,
        "sum_squared": 0.15009864841089557,
        "min": 0.3874256682395935,
        "max": 0.3874256682395935,
        "mean": 0.3874256682395935,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.53,
        "sum_squared": 0.28090000000000004,
        "min": 0.53,
        "max": 0.53,
        "mean": 0.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 554.4,
        "sum_squared": 307359.36,
        "min": 554.4,
        "max": 554.4,
        "mean": 554.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5093840050697327,
        "sum_squared": 0.25947206462088146,
        "min": 0.5093840050697327,
        "max": 0.5093840050697327,
        "mean": 0.5093840050697327,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 554.4,
        "sum_squared": 307359.36,
        "min": 554.4,
        "max": 554.4,
        "mean": 554.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 554.4,
        "sum_squared": 307359.36,
        "min": 554.4,
        "max": 554.4,
        "mean": 554.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5093840050697327,
        "sum_squared": 0.25947206462088146,
        "min": 0.5093840050697327,
        "max": 0.5093840050697327,
        "mean": 0.5093840050697327,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5093840050697327,
        "sum_squared": 0.25947206462088146,
        "min": 0.5093840050697327,
        "max": 0.5093840050697327,
        "mean": 0.5093840050697327,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
    "run_spec": {
      "name": "mmlu:subject=college_chemistry,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_college_chemistry",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "college_chemistry"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about college chemistry. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_college_chemistry"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 573.25,
        "sum_squared": 328615.5625,
        "min": 573.25,
        "max": 573.25,
        "mean": 573.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2913021016120911,
        "sum_squared": 0.08485691440362103,
        "min": 0.2913021016120911,
        "max": 0.2913021016120911,
        "mean": 0.2913021016120911,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 573.25,
        "sum_squared": 328615.5625,
        "min": 573.25,
        "max": 573.25,
        "mean": 573.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 573.25,
        "sum_squared": 328615.5625,
        "min": 573.25,
        "max": 573.25,
        "mean": 573.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2913021016120911,
        "sum_squared": 0.08485691440362103,
        "min": 0.2913021016120911,
        "max": 0.2913021016120911,
        "mean": 0.2913021016120911,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2913021016120911,
        "sum_squared": 0.08485691440362103,
        "min": 0.2913021016120911,
        "max": 0.2913021016120911,
        "mean": 0.2913021016120911,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.43,
        "sum_squared": 0.18489999999999998,
        "min": 0.43,
        "max": 0.43,
        "mean": 0.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 417.74,
        "sum_squared": 174506.7076,
        "min": 417.74,
        "max": 417.74,
        "mean": 417.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32253321409225466,
        "sum_squared": 0.10402767419268018,
        "min": 0.32253321409225466,
        "max": 0.32253321409225466,
        "mean": 0.32253321409225466,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 417.74,
        "sum_squared": 174506.7076,
        "min": 417.74,
        "max": 417.74,
        "mean": 417.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 417.74,
        "sum_squared": 174506.7076,
        "min": 417.74,
        "max": 417.74,
        "mean": 417.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32253321409225466,
        "sum_squared": 0.10402767419268018,
        "min": 0.32253321409225466,
        "max": 0.32253321409225466,
        "mean": 0.32253321409225466,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32253321409225466,
        "sum_squared": 0.10402767419268018,
        "min": 0.32253321409225466,
        "max": 0.32253321409225466,
        "mean": 0.32253321409225466,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.74,
        "sum_squared": 0.5476,
        "min": 0.74,
        "max": 0.74,
        "mean": 0.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 409.62,
        "sum_squared": 167788.5444,
        "min": 409.62,
        "max": 409.62,
        "mean": 409.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.2952637958526612,
        "sum_squared": 1.6777083008466442,
        "min": 1.2952637958526612,
        "max": 1.2952637958526612,
        "mean": 1.2952637958526612,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 3.81,
        "sum_squared": 14.5161,
        "min": 3.81,
        "max": 3.81,
        "mean": 3.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 409.62,
        "sum_squared": 167788.5444,
        "min": 409.62,
        "max": 409.62,
        "mean": 409.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 409.62,
        "sum_squared": 167788.5444,
        "min": 409.62,
        "max": 409.62,
        "mean": 409.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2952637958526612,
        "sum_squared": 1.6777083008466442,
        "min": 1.2952637958526612,
        "max": 1.2952637958526612,
        "mean": 1.2952637958526612,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2952637958526612,
        "sum_squared": 1.6777083008466442,
        "min": 1.2952637958526612,
        "max": 1.2952637958526612,
        "mean": 1.2952637958526612,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.59,
        "sum_squared": 0.34809999999999997,
        "min": 0.59,
        "max": 0.59,
        "mean": 0.59,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.81,
        "sum_squared": 14.5161,
        "min": 3.81,
        "max": 3.81,
        "mean": 3.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.81,
        "sum_squared": 14.5161,
        "min": 3.81,
        "max": 3.81,
        "mean": 3.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-opus-20240229",
        "model": "anthropic/claude-3-opus-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 401.62,
        "sum_squared": 161298.6244,
        "min": 401.62,
        "max": 401.62,
        "mean": 401.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 4.12412523984909,
        "sum_squared": 17.008408993960316,
        "min": 4.12412523984909,
        "max": 4.12412523984909,
        "mean": 4.12412523984909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 401.62,
        "sum_squared": 161298.6244,
        "min": 401.62,
        "max": 401.62,
        "mean": 401.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 401.62,
        "sum_squared": 161298.6244,
        "min": 401.62,
        "max": 401.62,
        "mean": 401.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.12412523984909,
        "sum_squared": 17.008408993960316,
        "min": 4.12412523984909,
        "max": 4.12412523984909,
        "mean": 4.12412523984909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.12412523984909,
        "sum_squared": 17.008408993960316,
        "min": 4.12412523984909,
        "max": 4.12412523984909,
        "mean": 4.12412523984909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-sonnet-20240229",
        "model": "anthropic/claude-3-sonnet-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 401.62,
        "sum_squared": 161298.6244,
        "min": 401.62,
        "max": 401.62,
        "mean": 401.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.7186831426620484,
        "sum_squared": 2.953871744870695,
        "min": 1.7186831426620484,
        "max": 1.7186831426620484,
        "mean": 1.7186831426620484,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 401.62,
        "sum_squared": 161298.6244,
        "min": 401.62,
        "max": 401.62,
        "mean": 401.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 401.62,
        "sum_squared": 161298.6244,
        "min": 401.62,
        "max": 401.62,
        "mean": 401.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.7186831426620484,
        "sum_squared": 2.953871744870695,
        "min": 1.7186831426620484,
        "max": 1.7186831426620484,
        "mean": 1.7186831426620484,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.7186831426620484,
        "sum_squared": 2.953871744870695,
        "min": 1.7186831426620484,
        "max": 1.7186831426620484,
        "mean": 1.7186831426620484,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 409.62,
        "sum_squared": 167788.5444,
        "min": 409.62,
        "max": 409.62,
        "mean": 409.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89654052734375,
        "sum_squared": 0.8037849171698094,
        "min": 0.89654052734375,
        "max": 0.89654052734375,
        "mean": 0.89654052734375,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 409.62,
        "sum_squared": 167788.5444,
        "min": 409.62,
        "max": 409.62,
        "mean": 409.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 409.62,
        "sum_squared": 167788.5444,
        "min": 409.62,
        "max": 409.62,
        "mean": 409.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89654052734375,
        "sum_squared": 0.8037849171698094,
        "min": 0.89654052734375,
        "max": 0.89654052734375,
        "mean": 0.89654052734375,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89654052734375,
        "sum_squared": 0.8037849171698094,
        "min": 0.89654052734375,
        "max": 0.89654052734375,
        "mean": 0.89654052734375,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.76,
        "sum_squared": 0.5776,
        "min": 0.76,
        "max": 0.76,
        "mean": 0.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-pro",
        "model": "google/gemini-pro",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.39635706424713135,
        "sum_squared": 0.15709892237860462,
        "min": 0.39635706424713135,
        "max": 0.39635706424713135,
        "mean": 0.39635706424713135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39635706424713135,
        "sum_squared": 0.15709892237860462,
        "min": 0.39635706424713135,
        "max": 0.39635706424713135,
        "mean": 0.39635706424713135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.39635706424713135,
        "sum_squared": 0.15709892237860462,
        "min": 0.39635706424713135,
        "max": 0.39635706424713135,
        "mean": 0.39635706424713135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.83,
        "sum_squared": 0.6889,
        "min": 0.83,
        "max": 0.83,
        "mean": 0.83,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2700689482688904,
        "sum_squared": 0.0729372368190646,
        "min": 0.2700689482688904,
        "max": 0.2700689482688904,
        "mean": 0.2700689482688904,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2700689482688904,
        "sum_squared": 0.0729372368190646,
        "min": 0.2700689482688904,
        "max": 0.2700689482688904,
        "mean": 0.2700689482688904,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2700689482688904,
        "sum_squared": 0.0729372368190646,
        "min": 0.2700689482688904,
        "max": 0.2700689482688904,
        "mean": 0.2700689482688904,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29032307147979736,
        "sum_squared": 0.08428748583346353,
        "min": 0.29032307147979736,
        "max": 0.29032307147979736,
        "mean": 0.29032307147979736,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 384.91,
        "sum_squared": 148155.70810000002,
        "min": 384.91,
        "max": 384.91,
        "mean": 384.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29032307147979736,
        "sum_squared": 0.08428748583346353,
        "min": 0.29032307147979736,
        "max": 0.29032307147979736,
        "mean": 0.29032307147979736,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29032307147979736,
        "sum_squared": 0.08428748583346353,
        "min": 0.29032307147979736,
        "max": 0.29032307147979736,
        "mean": 0.29032307147979736,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17,
        "sum_squared": 0.028900000000000006,
        "min": 0.17,
        "max": 0.17,
        "mean": 0.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 396.24,
        "sum_squared": 157006.13760000002,
        "min": 396.24,
        "max": 396.24,
        "mean": 396.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7861250281333924,
        "sum_squared": 0.6179925598577269,
        "min": 0.7861250281333924,
        "max": 0.7861250281333924,
        "mean": 0.7861250281333924,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 396.24,
        "sum_squared": 157006.13760000002,
        "min": 396.24,
        "max": 396.24,
        "mean": 396.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 396.24,
        "sum_squared": 157006.13760000002,
        "min": 396.24,
        "max": 396.24,
        "mean": 396.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7861250281333924,
        "sum_squared": 0.6179925598577269,
        "min": 0.7861250281333924,
        "max": 0.7861250281333924,
        "mean": 0.7861250281333924,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7861250281333924,
        "sum_squared": 0.6179925598577269,
        "min": 0.7861250281333924,
        "max": 0.7861250281333924,
        "mean": 0.7861250281333924,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 396.24,
        "sum_squared": 157006.13760000002,
        "min": 396.24,
        "max": 396.24,
        "mean": 396.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.411904981136322,
        "sum_squared": 1.9934756757575578,
        "min": 1.411904981136322,
        "max": 1.411904981136322,
        "mean": 1.411904981136322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 396.24,
        "sum_squared": 157006.13760000002,
        "min": 396.24,
        "max": 396.24,
        "mean": 396.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 396.24,
        "sum_squared": 157006.13760000002,
        "min": 396.24,
        "max": 396.24,
        "mean": 396.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.411904981136322,
        "sum_squared": 1.9934756757575578,
        "min": 1.411904981136322,
        "max": 1.411904981136322,
        "mean": 1.411904981136322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.411904981136322,
        "sum_squared": 1.9934756757575578,
        "min": 1.411904981136322,
        "max": 1.411904981136322,
        "mean": 1.411904981136322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 440.17,
        "sum_squared": 193749.6289,
        "min": 440.17,
        "max": 440.17,
        "mean": 440.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2909871745109558,
        "sum_squared": 0.08467353572986944,
        "min": 0.2909871745109558,
        "max": 0.2909871745109558,
        "mean": 0.2909871745109558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 440.17,
        "sum_squared": 193749.6289,
        "min": 440.17,
        "max": 440.17,
        "mean": 440.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 440.17,
        "sum_squared": 193749.6289,
        "min": 440.17,
        "max": 440.17,
        "mean": 440.17,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2909871745109558,
        "sum_squared": 0.08467353572986944,
        "min": 0.2909871745109558,
        "max": 0.2909871745109558,
        "mean": 0.2909871745109558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2909871745109558,
        "sum_squared": 0.08467353572986944,
        "min": 0.2909871745109558,
        "max": 0.2909871745109558,
        "mean": 0.2909871745109558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 399.4,
        "sum_squared": 159520.36,
        "min": 399.4,
        "max": 399.4,
        "mean": 399.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2723871803283691,
        "sum_squared": 0.07419477600723948,
        "min": 0.2723871803283691,
        "max": 0.2723871803283691,
        "mean": 0.2723871803283691,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 399.4,
        "sum_squared": 159520.36,
        "min": 399.4,
        "max": 399.4,
        "mean": 399.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 399.4,
        "sum_squared": 159520.36,
        "min": 399.4,
        "max": 399.4,
        "mean": 399.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2723871803283691,
        "sum_squared": 0.07419477600723948,
        "min": 0.2723871803283691,
        "max": 0.2723871803283691,
        "mean": 0.2723871803283691,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2723871803283691,
        "sum_squared": 0.07419477600723948,
        "min": 0.2723871803283691,
        "max": 0.2723871803283691,
        "mean": 0.2723871803283691,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.73,
        "sum_squared": 0.5328999999999999,
        "min": 0.73,
        "max": 0.73,
        "mean": 0.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 436.94,
        "sum_squared": 190916.5636,
        "min": 436.94,
        "max": 436.94,
        "mean": 436.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2954034161567688,
        "sum_squared": 0.08726317827708913,
        "min": 0.2954034161567688,
        "max": 0.2954034161567688,
        "mean": 0.2954034161567688,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 436.94,
        "sum_squared": 190916.5636,
        "min": 436.94,
        "max": 436.94,
        "mean": 436.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 436.94,
        "sum_squared": 190916.5636,
        "min": 436.94,
        "max": 436.94,
        "mean": 436.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2954034161567688,
        "sum_squared": 0.08726317827708913,
        "min": 0.2954034161567688,
        "max": 0.2954034161567688,
        "mean": 0.2954034161567688,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2954034161567688,
        "sum_squared": 0.08726317827708913,
        "min": 0.2954034161567688,
        "max": 0.2954034161567688,
        "mean": 0.2954034161567688,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.82,
        "sum_squared": 0.6723999999999999,
        "min": 0.82,
        "max": 0.82,
        "mean": 0.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 383.54,
        "sum_squared": 147102.9316,
        "min": 383.54,
        "max": 383.54,
        "mean": 383.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3922041606903076,
        "sum_squared": 0.15382410366278865,
        "min": 0.3922041606903076,
        "max": 0.3922041606903076,
        "mean": 0.3922041606903076,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.54,
        "sum_squared": 147102.9316,
        "min": 383.54,
        "max": 383.54,
        "mean": 383.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.54,
        "sum_squared": 147102.9316,
        "min": 383.54,
        "max": 383.54,
        "mean": 383.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3922041606903076,
        "sum_squared": 0.15382410366278865,
        "min": 0.3922041606903076,
        "max": 0.3922041606903076,
        "mean": 0.3922041606903076,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3922041606903076,
        "sum_squared": 0.15382410366278865,
        "min": 0.3922041606903076,
        "max": 0.3922041606903076,
        "mean": 0.3922041606903076,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 383.54,
        "sum_squared": 147102.9316,
        "min": 383.54,
        "max": 383.54,
        "mean": 383.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4907408857345581,
        "sum_squared": 0.24082661693153862,
        "min": 0.4907408857345581,
        "max": 0.4907408857345581,
        "mean": 0.4907408857345581,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.54,
        "sum_squared": 147102.9316,
        "min": 383.54,
        "max": 383.54,
        "mean": 383.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 383.54,
        "sum_squared": 147102.9316,
        "min": 383.54,
        "max": 383.54,
        "mean": 383.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4907408857345581,
        "sum_squared": 0.24082661693153862,
        "min": 0.4907408857345581,
        "max": 0.4907408857345581,
        "mean": 0.4907408857345581,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4907408857345581,
        "sum_squared": 0.24082661693153862,
        "min": 0.4907408857345581,
        "max": 0.4907408857345581,
        "mean": 0.4907408857345581,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.84,
        "sum_squared": 0.7055999999999999,
        "min": 0.84,
        "max": 0.84,
        "mean": 0.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
    "run_spec": {
      "name": "mmlu:subject=computer_security,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_computer_security",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "computer_security"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about computer security. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_computer_security"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 391.64,
        "sum_squared": 153381.8896,
        "min": 391.64,
        "max": 391.64,
        "mean": 391.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2926693534851074,
        "sum_squared": 0.08565535046939075,
        "min": 0.2926693534851074,
        "max": 0.2926693534851074,
        "mean": 0.2926693534851074,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 391.64,
        "sum_squared": 153381.8896,
        "min": 391.64,
        "max": 391.64,
        "mean": 391.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 391.64,
        "sum_squared": 153381.8896,
        "min": 391.64,
        "max": 391.64,
        "mean": 391.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2926693534851074,
        "sum_squared": 0.08565535046939075,
        "min": 0.2926693534851074,
        "max": 0.2926693534851074,
        "mean": 0.2926693534851074,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2926693534851074,
        "sum_squared": 0.08565535046939075,
        "min": 0.2926693534851074,
        "max": 0.2926693534851074,
        "mean": 0.2926693534851074,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.75,
        "sum_squared": 0.5625,
        "min": 0.75,
        "max": 0.75,
        "mean": 0.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 679.7894736842105,
        "sum_squared": 462113.728531856,
        "min": 679.7894736842105,
        "max": 679.7894736842105,
        "mean": 679.7894736842105,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32449395196479663,
        "sum_squared": 0.10529632486173175,
        "min": 0.32449395196479663,
        "max": 0.32449395196479663,
        "mean": 0.32449395196479663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 679.7894736842105,
        "sum_squared": 462113.728531856,
        "min": 679.7894736842105,
        "max": 679.7894736842105,
        "mean": 679.7894736842105,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 679.7894736842105,
        "sum_squared": 462113.728531856,
        "min": 679.7894736842105,
        "max": 679.7894736842105,
        "mean": 679.7894736842105,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32449395196479663,
        "sum_squared": 0.10529632486173175,
        "min": 0.32449395196479663,
        "max": 0.32449395196479663,
        "mean": 0.32449395196479663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32449395196479663,
        "sum_squared": 0.10529632486173175,
        "min": 0.32449395196479663,
        "max": 0.32449395196479663,
        "mean": 0.32449395196479663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35964912280701755,
        "sum_squared": 0.1293474915358572,
        "min": 0.35964912280701755,
        "max": 0.35964912280701755,
        "mean": 0.35964912280701755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 630.5964912280701,
        "sum_squared": 397651.93474915356,
        "min": 630.5964912280701,
        "max": 630.5964912280701,
        "mean": 630.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.3978815350616187,
        "sum_squared": 1.9540727860662275,
        "min": 1.3978815350616187,
        "max": 1.3978815350616187,
        "mean": 1.3978815350616187,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.6842105263157894,
        "sum_squared": 2.836565096952908,
        "min": 1.6842105263157894,
        "max": 1.6842105263157894,
        "mean": 1.6842105263157894,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 630.5964912280701,
        "sum_squared": 397651.93474915356,
        "min": 630.5964912280701,
        "max": 630.5964912280701,
        "mean": 630.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 630.5964912280701,
        "sum_squared": 397651.93474915356,
        "min": 630.5964912280701,
        "max": 630.5964912280701,
        "mean": 630.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.3978815350616187,
        "sum_squared": 1.9540727860662275,
        "min": 1.3978815350616187,
        "max": 1.3978815350616187,
        "mean": 1.3978815350616187,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.3978815350616187,
        "sum_squared": 1.9540727860662275,
        "min": 1.3978815350616187,
        "max": 1.3978815350616187,
        "mean": 1.3978815350616187,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5614035087719298,
        "sum_squared": 0.3151738996614343,
        "min": 0.5614035087719298,
        "max": 0.5614035087719298,
        "mean": 0.5614035087719298,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6842105263157894,
        "sum_squared": 2.836565096952908,
        "min": 1.6842105263157894,
        "max": 1.6842105263157894,
        "mean": 1.6842105263157894,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6842105263157894,
        "sum_squared": 2.836565096952908,
        "min": 1.6842105263157894,
        "max": 1.6842105263157894,
        "mean": 1.6842105263157894,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-opus-20240229",
        "model": "anthropic/claude-3-opus-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 622.5964912280701,
        "sum_squared": 387626.3908895044,
        "min": 622.5964912280701,
        "max": 622.5964912280701,
        "mean": 622.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 4.46739117095345,
        "sum_squared": 19.957583874312835,
        "min": 4.46739117095345,
        "max": 4.46739117095345,
        "mean": 4.46739117095345,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 622.5964912280701,
        "sum_squared": 387626.3908895044,
        "min": 622.5964912280701,
        "max": 622.5964912280701,
        "mean": 622.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 622.5964912280701,
        "sum_squared": 387626.3908895044,
        "min": 622.5964912280701,
        "max": 622.5964912280701,
        "mean": 622.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.46739117095345,
        "sum_squared": 19.957583874312835,
        "min": 4.46739117095345,
        "max": 4.46739117095345,
        "mean": 4.46739117095345,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.46739117095345,
        "sum_squared": 19.957583874312835,
        "min": 4.46739117095345,
        "max": 4.46739117095345,
        "mean": 4.46739117095345,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7719298245614035,
        "sum_squared": 0.5958756540473992,
        "min": 0.7719298245614035,
        "max": 0.7719298245614035,
        "mean": 0.7719298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-sonnet-20240229",
        "model": "anthropic/claude-3-sonnet-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 622.5964912280701,
        "sum_squared": 387626.3908895044,
        "min": 622.5964912280701,
        "max": 622.5964912280701,
        "mean": 622.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.8370495072582311,
        "sum_squared": 3.3747508921177096,
        "min": 1.8370495072582311,
        "max": 1.8370495072582311,
        "mean": 1.8370495072582311,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0701754385964912,
        "sum_squared": 1.1452754693751923,
        "min": 1.0701754385964912,
        "max": 1.0701754385964912,
        "mean": 1.0701754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 622.5964912280701,
        "sum_squared": 387626.3908895044,
        "min": 622.5964912280701,
        "max": 622.5964912280701,
        "mean": 622.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 622.5964912280701,
        "sum_squared": 387626.3908895044,
        "min": 622.5964912280701,
        "max": 622.5964912280701,
        "mean": 622.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.8370495072582311,
        "sum_squared": 3.3747508921177096,
        "min": 1.8370495072582311,
        "max": 1.8370495072582311,
        "mean": 1.8370495072582311,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.8370495072582311,
        "sum_squared": 3.3747508921177096,
        "min": 1.8370495072582311,
        "max": 1.8370495072582311,
        "mean": 1.8370495072582311,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0701754385964912,
        "sum_squared": 1.1452754693751923,
        "min": 1.0701754385964912,
        "max": 1.0701754385964912,
        "mean": 1.0701754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0701754385964912,
        "sum_squared": 1.1452754693751923,
        "min": 1.0701754385964912,
        "max": 1.0701754385964912,
        "mean": 1.0701754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 630.5964912280701,
        "sum_squared": 397651.93474915356,
        "min": 630.5964912280701,
        "max": 630.5964912280701,
        "mean": 630.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.116357355787043,
        "sum_squared": 1.2462537458198384,
        "min": 1.116357355787043,
        "max": 1.116357355787043,
        "mean": 1.116357355787043,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 630.5964912280701,
        "sum_squared": 397651.93474915356,
        "min": 630.5964912280701,
        "max": 630.5964912280701,
        "mean": 630.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 630.5964912280701,
        "sum_squared": 397651.93474915356,
        "min": 630.5964912280701,
        "max": 630.5964912280701,
        "mean": 630.5964912280701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.116357355787043,
        "sum_squared": 1.2462537458198384,
        "min": 1.116357355787043,
        "max": 1.116357355787043,
        "mean": 1.116357355787043,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.116357355787043,
        "sum_squared": 1.2462537458198384,
        "min": 1.116357355787043,
        "max": 1.116357355787043,
        "mean": 1.116357355787043,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3684210526315789,
        "sum_squared": 0.13573407202216065,
        "min": 0.3684210526315789,
        "max": 0.3684210526315789,
        "mean": 0.3684210526315789,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37719298245614036,
        "sum_squared": 0.1422745460141582,
        "min": 0.37719298245614036,
        "max": 0.37719298245614036,
        "mean": 0.37719298245614036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-pro",
        "model": "google/gemini-pro",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7309655264804238,
        "sum_squared": 0.5343106009028031,
        "min": 0.7309655264804238,
        "max": 0.7309655264804238,
        "mean": 0.7309655264804238,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7309655264804238,
        "sum_squared": 0.5343106009028031,
        "min": 0.7309655264804238,
        "max": 0.7309655264804238,
        "mean": 0.7309655264804238,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7309655264804238,
        "sum_squared": 0.5343106009028031,
        "min": 0.7309655264804238,
        "max": 0.7309655264804238,
        "mean": 0.7309655264804238,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5789473684210527,
        "sum_squared": 0.33518005540166207,
        "min": 0.5789473684210527,
        "max": 0.5789473684210527,
        "mean": 0.5789473684210527,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.27136534766147014,
        "sum_squared": 0.07363915191143056,
        "min": 0.27136534766147014,
        "max": 0.27136534766147014,
        "mean": 0.27136534766147014,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27136534766147014,
        "sum_squared": 0.07363915191143056,
        "min": 0.27136534766147014,
        "max": 0.27136534766147014,
        "mean": 0.27136534766147014,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.27136534766147014,
        "sum_squared": 0.07363915191143056,
        "min": 0.27136534766147014,
        "max": 0.27136534766147014,
        "mean": 0.27136534766147014,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.26876863889526903,
        "sum_squared": 0.07223658125361553,
        "min": 0.26876863889526903,
        "max": 0.26876863889526903,
        "mean": 0.26876863889526903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 638.5526315789474,
        "sum_squared": 407749.4632963989,
        "min": 638.5526315789474,
        "max": 638.5526315789474,
        "mean": 638.5526315789474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26876863889526903,
        "sum_squared": 0.07223658125361553,
        "min": 0.26876863889526903,
        "max": 0.26876863889526903,
        "mean": 0.26876863889526903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26876863889526903,
        "sum_squared": 0.07223658125361553,
        "min": 0.26876863889526903,
        "max": 0.26876863889526903,
        "mean": 0.26876863889526903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2543859649122807,
        "sum_squared": 0.06471221914435211,
        "min": 0.2543859649122807,
        "max": 0.2543859649122807,
        "mean": 0.2543859649122807,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2631578947368421,
        "sum_squared": 0.06925207756232686,
        "min": 0.2631578947368421,
        "max": 0.2631578947368421,
        "mean": 0.2631578947368421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 656.3947368421053,
        "sum_squared": 430854.05055401666,
        "min": 656.3947368421053,
        "max": 656.3947368421053,
        "mean": 656.3947368421053,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.019343238127859,
        "sum_squared": 1.0390606371169893,
        "min": 1.019343238127859,
        "max": 1.019343238127859,
        "mean": 1.019343238127859,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 656.3947368421053,
        "sum_squared": 430854.05055401666,
        "min": 656.3947368421053,
        "max": 656.3947368421053,
        "mean": 656.3947368421053,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 656.3947368421053,
        "sum_squared": 430854.05055401666,
        "min": 656.3947368421053,
        "max": 656.3947368421053,
        "mean": 656.3947368421053,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.019343238127859,
        "sum_squared": 1.0390606371169893,
        "min": 1.019343238127859,
        "max": 1.019343238127859,
        "mean": 1.019343238127859,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.019343238127859,
        "sum_squared": 1.0390606371169893,
        "min": 1.019343238127859,
        "max": 1.019343238127859,
        "mean": 1.019343238127859,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 656.3947368421053,
        "sum_squared": 430854.05055401666,
        "min": 656.3947368421053,
        "max": 656.3947368421053,
        "mean": 656.3947368421053,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8350292967076887,
        "sum_squared": 0.6972739263601372,
        "min": 0.8350292967076887,
        "max": 0.8350292967076887,
        "mean": 0.8350292967076887,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 656.3947368421053,
        "sum_squared": 430854.05055401666,
        "min": 656.3947368421053,
        "max": 656.3947368421053,
        "mean": 656.3947368421053,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 656.3947368421053,
        "sum_squared": 430854.05055401666,
        "min": 656.3947368421053,
        "max": 656.3947368421053,
        "mean": 656.3947368421053,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8350292967076887,
        "sum_squared": 0.6972739263601372,
        "min": 0.8350292967076887,
        "max": 0.8350292967076887,
        "mean": 0.8350292967076887,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8350292967076887,
        "sum_squared": 0.6972739263601372,
        "min": 0.8350292967076887,
        "max": 0.8350292967076887,
        "mean": 0.8350292967076887,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6491228070175439,
        "sum_squared": 0.4213604185903355,
        "min": 0.6491228070175439,
        "max": 0.6491228070175439,
        "mean": 0.6491228070175439,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 696.6754385964912,
        "sum_squared": 485356.6667436134,
        "min": 696.6754385964912,
        "max": 696.6754385964912,
        "mean": 696.6754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.30400960487231876,
        "sum_squared": 0.09242183985462338,
        "min": 0.30400960487231876,
        "max": 0.30400960487231876,
        "mean": 0.30400960487231876,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 696.6754385964912,
        "sum_squared": 485356.6667436134,
        "min": 696.6754385964912,
        "max": 696.6754385964912,
        "mean": 696.6754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 696.6754385964912,
        "sum_squared": 485356.6667436134,
        "min": 696.6754385964912,
        "max": 696.6754385964912,
        "mean": 696.6754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.30400960487231876,
        "sum_squared": 0.09242183985462338,
        "min": 0.30400960487231876,
        "max": 0.30400960487231876,
        "mean": 0.30400960487231876,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.30400960487231876,
        "sum_squared": 0.09242183985462338,
        "min": 0.30400960487231876,
        "max": 0.30400960487231876,
        "mean": 0.30400960487231876,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32456140350877194,
        "sum_squared": 0.10534010464758388,
        "min": 0.32456140350877194,
        "max": 0.32456140350877194,
        "mean": 0.32456140350877194,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 636.0701754385965,
        "sum_squared": 404585.26808248693,
        "min": 636.0701754385965,
        "max": 636.0701754385965,
        "mean": 636.0701754385965,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2779764794466788,
        "sum_squared": 0.07727092312556984,
        "min": 0.2779764794466788,
        "max": 0.2779764794466788,
        "mean": 0.2779764794466788,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 636.0701754385965,
        "sum_squared": 404585.26808248693,
        "min": 636.0701754385965,
        "max": 636.0701754385965,
        "mean": 636.0701754385965,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 636.0701754385965,
        "sum_squared": 404585.26808248693,
        "min": 636.0701754385965,
        "max": 636.0701754385965,
        "mean": 636.0701754385965,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2779764794466788,
        "sum_squared": 0.07727092312556984,
        "min": 0.2779764794466788,
        "max": 0.2779764794466788,
        "mean": 0.2779764794466788,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2779764794466788,
        "sum_squared": 0.07727092312556984,
        "min": 0.2779764794466788,
        "max": 0.2779764794466788,
        "mean": 0.2779764794466788,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3508771929824561,
        "sum_squared": 0.12311480455524776,
        "min": 0.3508771929824561,
        "max": 0.3508771929824561,
        "mean": 0.3508771929824561,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 699.1754385964912,
        "sum_squared": 488846.2939365958,
        "min": 699.1754385964912,
        "max": 699.1754385964912,
        "mean": 699.1754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.33104314302143295,
        "sum_squared": 0.10958956254150891,
        "min": 0.33104314302143295,
        "max": 0.33104314302143295,
        "mean": 0.33104314302143295,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 699.1754385964912,
        "sum_squared": 488846.2939365958,
        "min": 699.1754385964912,
        "max": 699.1754385964912,
        "mean": 699.1754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 699.1754385964912,
        "sum_squared": 488846.2939365958,
        "min": 699.1754385964912,
        "max": 699.1754385964912,
        "mean": 699.1754385964912,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33104314302143295,
        "sum_squared": 0.10958956254150891,
        "min": 0.33104314302143295,
        "max": 0.33104314302143295,
        "mean": 0.33104314302143295,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.33104314302143295,
        "sum_squared": 0.10958956254150891,
        "min": 0.33104314302143295,
        "max": 0.33104314302143295,
        "mean": 0.33104314302143295,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6228070175438597,
        "sum_squared": 0.38788858110187757,
        "min": 0.6228070175438597,
        "max": 0.6228070175438597,
        "mean": 0.6228070175438597,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 619.4298245614035,
        "sum_squared": 383693.30755617114,
        "min": 619.4298245614035,
        "max": 619.4298245614035,
        "mean": 619.4298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4066693531839471,
        "sum_squared": 0.1653799628190499,
        "min": 0.4066693531839471,
        "max": 0.4066693531839471,
        "mean": 0.4066693531839471,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 619.4298245614035,
        "sum_squared": 383693.30755617114,
        "min": 619.4298245614035,
        "max": 619.4298245614035,
        "mean": 619.4298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 619.4298245614035,
        "sum_squared": 383693.30755617114,
        "min": 619.4298245614035,
        "max": 619.4298245614035,
        "mean": 619.4298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4066693531839471,
        "sum_squared": 0.1653799628190499,
        "min": 0.4066693531839471,
        "max": 0.4066693531839471,
        "mean": 0.4066693531839471,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4066693531839471,
        "sum_squared": 0.1653799628190499,
        "min": 0.4066693531839471,
        "max": 0.4066693531839471,
        "mean": 0.4066693531839471,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5,
        "sum_squared": 0.25,
        "min": 0.5,
        "max": 0.5,
        "mean": 0.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5087719298245614,
        "sum_squared": 0.25884887657740846,
        "min": 0.5087719298245614,
        "max": 0.5087719298245614,
        "mean": 0.5087719298245614,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 619.4298245614035,
        "sum_squared": 383693.30755617114,
        "min": 619.4298245614035,
        "max": 619.4298245614035,
        "mean": 619.4298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5296852944190042,
        "sum_squared": 0.2805665111237472,
        "min": 0.5296852944190042,
        "max": 0.5296852944190042,
        "mean": 0.5296852944190042,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 619.4298245614035,
        "sum_squared": 383693.30755617114,
        "min": 619.4298245614035,
        "max": 619.4298245614035,
        "mean": 619.4298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 619.4298245614035,
        "sum_squared": 383693.30755617114,
        "min": 619.4298245614035,
        "max": 619.4298245614035,
        "mean": 619.4298245614035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5296852944190042,
        "sum_squared": 0.2805665111237472,
        "min": 0.5296852944190042,
        "max": 0.5296852944190042,
        "mean": 0.5296852944190042,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5296852944190042,
        "sum_squared": 0.2805665111237472,
        "min": 0.5296852944190042,
        "max": 0.5296852944190042,
        "mean": 0.5296852944190042,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7105263157894737,
        "sum_squared": 0.5048476454293629,
        "min": 0.7105263157894737,
        "max": 0.7105263157894737,
        "mean": 0.7105263157894737,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
    "run_spec": {
      "name": "mmlu:subject=econometrics,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_econometrics",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "econometrics"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about econometrics. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_econometrics"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 632.938596491228,
        "sum_squared": 400611.2669282856,
        "min": 632.938596491228,
        "max": 632.938596491228,
        "mean": 632.938596491228,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2751412308006956,
        "sum_squared": 0.07570269688652163,
        "min": 0.2751412308006956,
        "max": 0.2751412308006956,
        "mean": 0.2751412308006956,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 632.938596491228,
        "sum_squared": 400611.2669282856,
        "min": 632.938596491228,
        "max": 632.938596491228,
        "mean": 632.938596491228,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 632.938596491228,
        "sum_squared": 400611.2669282856,
        "min": 632.938596491228,
        "max": 632.938596491228,
        "mean": 632.938596491228,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2751412308006956,
        "sum_squared": 0.07570269688652163,
        "min": 0.2751412308006956,
        "max": 0.2751412308006956,
        "mean": 0.2751412308006956,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2751412308006956,
        "sum_squared": 0.07570269688652163,
        "min": 0.2751412308006956,
        "max": 0.2751412308006956,
        "mean": 0.2751412308006956,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45614035087719296,
        "sum_squared": 0.2080640196983687,
        "min": 0.45614035087719296,
        "max": 0.45614035087719296,
        "mean": 0.45614035087719296,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47368421052631576,
        "sum_squared": 0.22437673130193903,
        "min": 0.47368421052631576,
        "max": 0.47368421052631576,
        "mean": 0.47368421052631576,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 114.0,
        "sum_squared": 12996.0,
        "min": 114.0,
        "max": 114.0,
        "mean": 114.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=01-ai_yi-6b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 470.53,
        "sum_squared": 221398.48089999997,
        "min": 470.53,
        "max": 470.53,
        "mean": 470.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.32152209281921384,
        "sum_squared": 0.10337645617084716,
        "min": 0.32152209281921384,
        "max": 0.32152209281921384,
        "mean": 0.32152209281921384,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 470.53,
        "sum_squared": 221398.48089999997,
        "min": 470.53,
        "max": 470.53,
        "mean": 470.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 470.53,
        "sum_squared": 221398.48089999997,
        "min": 470.53,
        "max": 470.53,
        "mean": 470.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32152209281921384,
        "sum_squared": 0.10337645617084716,
        "min": 0.32152209281921384,
        "max": 0.32152209281921384,
        "mean": 0.32152209281921384,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32152209281921384,
        "sum_squared": 0.10337645617084716,
        "min": 0.32152209281921384,
        "max": 0.32152209281921384,
        "mean": 0.32152209281921384,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-2.1,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 466.25,
        "sum_squared": 217389.0625,
        "min": 466.25,
        "max": 466.25,
        "mean": 466.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1599264645576477,
        "sum_squared": 1.345429403181204,
        "min": 1.1599264645576477,
        "max": 1.1599264645576477,
        "mean": 1.1599264645576477,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 6.76,
        "sum_squared": 45.697599999999994,
        "min": 6.76,
        "max": 6.76,
        "mean": 6.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 466.25,
        "sum_squared": 217389.0625,
        "min": 466.25,
        "max": 466.25,
        "mean": 466.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 466.25,
        "sum_squared": 217389.0625,
        "min": 466.25,
        "max": 466.25,
        "mean": 466.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1599264645576477,
        "sum_squared": 1.345429403181204,
        "min": 1.1599264645576477,
        "max": 1.1599264645576477,
        "mean": 1.1599264645576477,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1599264645576477,
        "sum_squared": 1.345429403181204,
        "min": 1.1599264645576477,
        "max": 1.1599264645576477,
        "mean": 1.1599264645576477,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.76,
        "sum_squared": 45.697599999999994,
        "min": 6.76,
        "max": 6.76,
        "mean": 6.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.76,
        "sum_squared": 45.697599999999994,
        "min": 6.76,
        "max": 6.76,
        "mean": 6.76,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-opus-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-opus-20240229",
        "model": "anthropic/claude-3-opus-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 458.25,
        "sum_squared": 209993.0625,
        "min": 458.25,
        "max": 458.25,
        "mean": 458.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 4.53159328699112,
        "sum_squared": 20.53533771870298,
        "min": 4.53159328699112,
        "max": 4.53159328699112,
        "mean": 4.53159328699112,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 458.25,
        "sum_squared": 209993.0625,
        "min": 458.25,
        "max": 458.25,
        "mean": 458.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 458.25,
        "sum_squared": 209993.0625,
        "min": 458.25,
        "max": 458.25,
        "mean": 458.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.53159328699112,
        "sum_squared": 20.53533771870298,
        "min": 4.53159328699112,
        "max": 4.53159328699112,
        "mean": 4.53159328699112,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.53159328699112,
        "sum_squared": 20.53533771870298,
        "min": 4.53159328699112,
        "max": 4.53159328699112,
        "mean": 4.53159328699112,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.89,
        "sum_squared": 0.7921,
        "min": 0.89,
        "max": 0.89,
        "mean": 0.89,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-3-sonnet-20240229,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-sonnet-20240229",
        "model": "anthropic/claude-3-sonnet-20240229",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 458.25,
        "sum_squared": 209993.0625,
        "min": 458.25,
        "max": 458.25,
        "mean": 458.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.6285528326034546,
        "sum_squared": 2.6521843285807356,
        "min": 1.6285528326034546,
        "max": 1.6285528326034546,
        "mean": 1.6285528326034546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 458.25,
        "sum_squared": 209993.0625,
        "min": 458.25,
        "max": 458.25,
        "mean": 458.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 458.25,
        "sum_squared": 209993.0625,
        "min": 458.25,
        "max": 458.25,
        "mean": 458.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6285528326034546,
        "sum_squared": 2.6521843285807356,
        "min": 1.6285528326034546,
        "max": 1.6285528326034546,
        "mean": 1.6285528326034546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6285528326034546,
        "sum_squared": 2.6521843285807356,
        "min": 1.6285528326034546,
        "max": 1.6285528326034546,
        "mean": 1.6285528326034546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.96,
        "sum_squared": 0.9216,
        "min": 0.96,
        "max": 0.96,
        "mean": 0.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=anthropic_claude-instant-1.2,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 466.25,
        "sum_squared": 217389.0625,
        "min": 466.25,
        "max": 466.25,
        "mean": 466.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8295096683502198,
        "sum_squared": 0.6880862898864916,
        "min": 0.8295096683502198,
        "max": 0.8295096683502198,
        "mean": 0.8295096683502198,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 466.25,
        "sum_squared": 217389.0625,
        "min": 466.25,
        "max": 466.25,
        "mean": 466.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 466.25,
        "sum_squared": 217389.0625,
        "min": 466.25,
        "max": 466.25,
        "mean": 466.25,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8295096683502198,
        "sum_squared": 0.6880862898864916,
        "min": 0.8295096683502198,
        "max": 0.8295096683502198,
        "mean": 0.8295096683502198,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8295096683502198,
        "sum_squared": 0.6880862898864916,
        "min": 0.8295096683502198,
        "max": 0.8295096683502198,
        "mean": 0.8295096683502198,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemini-pro,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-pro",
        "model": "google/gemini-pro",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5395235586166381,
        "sum_squared": 0.291085670302361,
        "min": 0.5395235586166381,
        "max": 0.5395235586166381,
        "mean": 0.5395235586166381,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5395235586166381,
        "sum_squared": 0.291085670302361,
        "min": 0.5395235586166381,
        "max": 0.5395235586166381,
        "mean": 0.5395235586166381,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5395235586166381,
        "sum_squared": 0.291085670302361,
        "min": 0.5395235586166381,
        "max": 0.5395235586166381,
        "mean": 0.5395235586166381,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2844899868965149,
        "sum_squared": 0.08093455264437921,
        "min": 0.2844899868965149,
        "max": 0.2844899868965149,
        "mean": 0.2844899868965149,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2844899868965149,
        "sum_squared": 0.08093455264437921,
        "min": 0.2844899868965149,
        "max": 0.2844899868965149,
        "mean": 0.2844899868965149,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2844899868965149,
        "sum_squared": 0.08093455264437921,
        "min": 0.2844899868965149,
        "max": 0.2844899868965149,
        "mean": 0.2844899868965149,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.88,
        "sum_squared": 0.7744,
        "min": 0.88,
        "max": 0.88,
        "mean": 0.88,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_gemma-7b-it,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2597520995140076,
        "sum_squared": 0.06747115320193489,
        "min": 0.2597520995140076,
        "max": 0.2597520995140076,
        "mean": 0.2597520995140076,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 442.2,
        "sum_squared": 195540.84,
        "min": 442.2,
        "max": 442.2,
        "mean": 442.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2597520995140076,
        "sum_squared": 0.06747115320193489,
        "min": 0.2597520995140076,
        "max": 0.2597520995140076,
        "mean": 0.2597520995140076,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2597520995140076,
        "sum_squared": 0.06747115320193489,
        "min": 0.2597520995140076,
        "max": 0.2597520995140076,
        "mean": 0.2597520995140076,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29,
        "sum_squared": 0.0841,
        "min": 0.29,
        "max": 0.29,
        "mean": 0.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-bison@001,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 452.48,
        "sum_squared": 204738.1504,
        "min": 452.48,
        "max": 452.48,
        "mean": 452.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.765197811126709,
        "sum_squared": 0.5855276901531067,
        "min": 0.765197811126709,
        "max": 0.765197811126709,
        "mean": 0.765197811126709,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 452.48,
        "sum_squared": 204738.1504,
        "min": 452.48,
        "max": 452.48,
        "mean": 452.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 452.48,
        "sum_squared": 204738.1504,
        "min": 452.48,
        "max": 452.48,
        "mean": 452.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.765197811126709,
        "sum_squared": 0.5855276901531067,
        "min": 0.765197811126709,
        "max": 0.765197811126709,
        "mean": 0.765197811126709,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.765197811126709,
        "sum_squared": 0.5855276901531067,
        "min": 0.765197811126709,
        "max": 0.765197811126709,
        "mean": 0.765197811126709,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9,
        "sum_squared": 0.81,
        "min": 0.9,
        "max": 0.9,
        "mean": 0.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=google_text-unicorn@001,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 452.48,
        "sum_squared": 204738.1504,
        "min": 452.48,
        "max": 452.48,
        "mean": 452.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.3936432957649232,
        "sum_squared": 1.9422416358305172,
        "min": 1.3936432957649232,
        "max": 1.3936432957649232,
        "mean": 1.3936432957649232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 452.48,
        "sum_squared": 204738.1504,
        "min": 452.48,
        "max": 452.48,
        "mean": 452.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 452.48,
        "sum_squared": 204738.1504,
        "min": 452.48,
        "max": 452.48,
        "mean": 452.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.3936432957649232,
        "sum_squared": 1.9422416358305172,
        "min": 1.3936432957649232,
        "max": 1.3936432957649232,
        "mean": 1.3936432957649232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.3936432957649232,
        "sum_squared": 1.9422416358305172,
        "min": 1.3936432957649232,
        "max": 1.3936432957649232,
        "mean": 1.3936432957649232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.92,
        "sum_squared": 0.8464,
        "min": 0.92,
        "max": 0.92,
        "mean": 0.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=meta_llama-2-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 491.81,
        "sum_squared": 241877.0761,
        "min": 491.81,
        "max": 491.81,
        "mean": 491.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2887767219543457,
        "sum_squared": 0.0833919951426975,
        "min": 0.2887767219543457,
        "max": 0.2887767219543457,
        "mean": 0.2887767219543457,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 491.81,
        "sum_squared": 241877.0761,
        "min": 491.81,
        "max": 491.81,
        "mean": 491.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 491.81,
        "sum_squared": 241877.0761,
        "min": 491.81,
        "max": 491.81,
        "mean": 491.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2887767219543457,
        "sum_squared": 0.0833919951426975,
        "min": 0.2887767219543457,
        "max": 0.2887767219543457,
        "mean": 0.2887767219543457,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2887767219543457,
        "sum_squared": 0.0833919951426975,
        "min": 0.2887767219543457,
        "max": 0.2887767219543457,
        "mean": 0.2887767219543457,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.65,
        "sum_squared": 0.42250000000000004,
        "min": 0.65,
        "max": 0.65,
        "mean": 0.65,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=microsoft_phi-2,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 445.12,
        "sum_squared": 198131.8144,
        "min": 445.12,
        "max": 445.12,
        "mean": 445.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28353949546813967,
        "sum_squared": 0.0803946454903272,
        "min": 0.28353949546813967,
        "max": 0.28353949546813967,
        "mean": 0.28353949546813967,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 445.12,
        "sum_squared": 198131.8144,
        "min": 445.12,
        "max": 445.12,
        "mean": 445.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 445.12,
        "sum_squared": 198131.8144,
        "min": 445.12,
        "max": 445.12,
        "mean": 445.12,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28353949546813967,
        "sum_squared": 0.0803946454903272,
        "min": 0.28353949546813967,
        "max": 0.28353949546813967,
        "mean": 0.28353949546813967,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28353949546813967,
        "sum_squared": 0.0803946454903272,
        "min": 0.28353949546813967,
        "max": 0.28353949546813967,
        "mean": 0.28353949546813967,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.78,
        "sum_squared": 0.6084,
        "min": 0.78,
        "max": 0.78,
        "mean": 0.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=mistralai_mixtral-8x7b-32kseqlen,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 485.19,
        "sum_squared": 235409.3361,
        "min": 485.19,
        "max": 485.19,
        "mean": 485.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.34373082876205446,
        "sum_squared": 0.11815088264144881,
        "min": 0.34373082876205446,
        "max": 0.34373082876205446,
        "mean": 0.34373082876205446,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 485.19,
        "sum_squared": 235409.3361,
        "min": 485.19,
        "max": 485.19,
        "mean": 485.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 485.19,
        "sum_squared": 235409.3361,
        "min": 485.19,
        "max": 485.19,
        "mean": 485.19,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34373082876205446,
        "sum_squared": 0.11815088264144881,
        "min": 0.34373082876205446,
        "max": 0.34373082876205446,
        "mean": 0.34373082876205446,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.34373082876205446,
        "sum_squared": 0.11815088264144881,
        "min": 0.34373082876205446,
        "max": 0.34373082876205446,
        "mean": 0.34373082876205446,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.93,
        "sum_squared": 0.8649000000000001,
        "min": 0.93,
        "max": 0.93,
        "mean": 0.93,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-3.5-turbo-0613,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 427.79,
        "sum_squared": 183004.28410000002,
        "min": 427.79,
        "max": 427.79,
        "mean": 427.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.40241742610931397,
        "sum_squared": 0.16193978483644517,
        "min": 0.40241742610931397,
        "max": 0.40241742610931397,
        "mean": 0.40241742610931397,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 427.79,
        "sum_squared": 183004.28410000002,
        "min": 427.79,
        "max": 427.79,
        "mean": 427.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 427.79,
        "sum_squared": 183004.28410000002,
        "min": 427.79,
        "max": 427.79,
        "mean": 427.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.40241742610931397,
        "sum_squared": 0.16193978483644517,
        "min": 0.40241742610931397,
        "max": 0.40241742610931397,
        "mean": 0.40241742610931397,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.40241742610931397,
        "sum_squared": 0.16193978483644517,
        "min": 0.40241742610931397,
        "max": 0.40241742610931397,
        "mean": 0.40241742610931397,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.87,
        "sum_squared": 0.7569,
        "min": 0.87,
        "max": 0.87,
        "mean": 0.87,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=openai_gpt-4-1106-preview,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 427.79,
        "sum_squared": 183004.28410000002,
        "min": 427.79,
        "max": 427.79,
        "mean": 427.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5122368717193604,
        "sum_squared": 0.26238661274883646,
        "min": 0.5122368717193604,
        "max": 0.5122368717193604,
        "mean": 0.5122368717193604,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 427.79,
        "sum_squared": 183004.28410000002,
        "min": 427.79,
        "max": 427.79,
        "mean": 427.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 427.79,
        "sum_squared": 183004.28410000002,
        "min": 427.79,
        "max": 427.79,
        "mean": 427.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5122368717193604,
        "sum_squared": 0.26238661274883646,
        "min": 0.5122368717193604,
        "max": 0.5122368717193604,
        "mean": 0.5122368717193604,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5122368717193604,
        "sum_squared": 0.26238661274883646,
        "min": 0.5122368717193604,
        "max": 0.5122368717193604,
        "mean": 0.5122368717193604,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.94,
        "sum_squared": 0.8835999999999999,
        "min": 0.94,
        "max": 0.94,
        "mean": 0.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
    "run_spec": {
      "name": "mmlu:subject=us_foreign_policy,method=multiple_choice_joint,model=qwen_qwen1.5-7b,eval_split=test,additional_instructions=yifan,groups=mmlu_us_foreign_policy",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.mmlu_scenario.MMLUScenario",
        "args": {
          "subject": "us_foreign_policy"
        }
      },
      "adapter_spec": {
        "method": "multiple_choice_joint",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "The following are multiple choice questions (with answers) about us foreign policy. Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 10000,
        "num_outputs": 5,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 1,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false,
        "eval_splits": [
          "test"
        ]
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "mmlu",
        "mmlu_us_foreign_policy"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 433.16,
        "sum_squared": 187627.58560000002,
        "min": 433.16,
        "max": 433.16,
        "mean": 433.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3014155149459839,
        "sum_squared": 0.09085131265015264,
        "min": 0.3014155149459839,
        "max": 0.3014155149459839,
        "mean": 0.3014155149459839,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test"
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.0,
        "sum_squared": 16.0,
        "min": 4.0,
        "max": 4.0,
        "mean": 4.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 433.16,
        "sum_squared": 187627.58560000002,
        "min": 433.16,
        "max": 433.16,
        "mean": 433.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 433.16,
        "sum_squared": 187627.58560000002,
        "min": 433.16,
        "max": 433.16,
        "mean": 433.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3014155149459839,
        "sum_squared": 0.09085131265015264,
        "min": 0.3014155149459839,
        "max": 0.3014155149459839,
        "mean": 0.3014155149459839,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3014155149459839,
        "sum_squared": 0.09085131265015264,
        "min": 0.3014155149459839,
        "max": 0.3014155149459839,
        "mean": 0.3014155149459839,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match@5",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.85,
        "sum_squared": 0.7224999999999999,
        "min": 0.85,
        "max": 0.85,
        "mean": 0.85,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=01-ai_yi-6b,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=01-ai_yi-6b,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3647.74358974359,
        "sum_squared": 13306033.296515452,
        "min": 3647.74358974359,
        "max": 3647.74358974359,
        "mean": 3647.74358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.3853136331607134,
        "sum_squared": 1.9190938622209357,
        "min": 1.3853136331607134,
        "max": 1.3853136331607134,
        "mean": 1.3853136331607134,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 4.871794871794871,
        "sum_squared": 23.734385272846808,
        "min": 4.871794871794871,
        "max": 4.871794871794871,
        "mean": 4.871794871794871,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3076923076923077,
        "sum_squared": 0.09467455621301776,
        "min": 0.3076923076923077,
        "max": 0.3076923076923077,
        "mean": 0.3076923076923077,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6709499615429599,
        "sum_squared": 0.4501738508944994,
        "min": 0.6709499615429599,
        "max": 0.6709499615429599,
        "mean": 0.6709499615429599,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.691480014238635,
        "sum_squared": 0.4781446100914629,
        "min": 0.691480014238635,
        "max": 0.691480014238635,
        "mean": 0.691480014238635,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5407203931382368,
        "sum_squared": 0.2923785435555694,
        "min": 0.5407203931382368,
        "max": 0.5407203931382368,
        "mean": 0.5407203931382368,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.08205128205128205,
        "sum_squared": 0.00673241288625904,
        "min": 0.08205128205128205,
        "max": 0.08205128205128205,
        "mean": 0.08205128205128205,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 23.487179487179485,
        "sum_squared": 551.6476002629848,
        "min": 23.487179487179485,
        "max": 23.487179487179485,
        "mean": 23.487179487179485,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3638.181818181818,
        "sum_squared": 13236366.94214876,
        "min": 3638.181818181818,
        "max": 3638.181818181818,
        "mean": 3638.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.394901535727761,
        "sum_squared": 1.9457502943756662,
        "min": 1.394901535727761,
        "max": 1.394901535727761,
        "mean": 1.394901535727761,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.909090909090909,
        "sum_squared": 24.09917355371901,
        "min": 4.909090909090909,
        "max": 4.909090909090909,
        "mean": 4.909090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7487313324633899,
        "sum_squared": 0.5605986082124033,
        "min": 0.7487313324633899,
        "max": 0.7487313324633899,
        "mean": 0.7487313324633899,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.829689608636977,
        "sum_squared": 0.6883848466801802,
        "min": 0.829689608636977,
        "max": 0.829689608636977,
        "mean": 0.829689608636977,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5569414914948082,
        "sum_squared": 0.31018382494846153,
        "min": 0.5569414914948082,
        "max": 0.5569414914948082,
        "mean": 0.5569414914948082,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.15467159893503413,
        "sum_squared": 0.02392330351712005,
        "min": 0.15467159893503413,
        "max": 0.15467159893503413,
        "mean": 0.15467159893503413,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 20.272727272727273,
        "sum_squared": 410.9834710743802,
        "min": 20.272727272727273,
        "max": 20.272727272727273,
        "mean": 20.272727272727273,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3647.74358974359,
        "sum_squared": 13306033.296515452,
        "min": 3647.74358974359,
        "max": 3647.74358974359,
        "mean": 3647.74358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3647.74358974359,
        "sum_squared": 13306033.296515452,
        "min": 3647.74358974359,
        "max": 3647.74358974359,
        "mean": 3647.74358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.3853136331607134,
        "sum_squared": 1.9190938622209357,
        "min": 1.3853136331607134,
        "max": 1.3853136331607134,
        "mean": 1.3853136331607134,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.3853136331607134,
        "sum_squared": 1.9190938622209357,
        "min": 1.3853136331607134,
        "max": 1.3853136331607134,
        "mean": 1.3853136331607134,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.871794871794871,
        "sum_squared": 23.734385272846808,
        "min": 4.871794871794871,
        "max": 4.871794871794871,
        "mean": 4.871794871794871,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.871794871794871,
        "sum_squared": 23.734385272846808,
        "min": 4.871794871794871,
        "max": 4.871794871794871,
        "mean": 4.871794871794871,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3076923076923077,
        "sum_squared": 0.09467455621301776,
        "min": 0.3076923076923077,
        "max": 0.3076923076923077,
        "mean": 0.3076923076923077,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3076923076923077,
        "sum_squared": 0.09467455621301776,
        "min": 0.3076923076923077,
        "max": 0.3076923076923077,
        "mean": 0.3076923076923077,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6709499615429599,
        "sum_squared": 0.4501738508944994,
        "min": 0.6709499615429599,
        "max": 0.6709499615429599,
        "mean": 0.6709499615429599,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6709499615429599,
        "sum_squared": 0.4501738508944994,
        "min": 0.6709499615429599,
        "max": 0.6709499615429599,
        "mean": 0.6709499615429599,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.691480014238635,
        "sum_squared": 0.4781446100914629,
        "min": 0.691480014238635,
        "max": 0.691480014238635,
        "mean": 0.691480014238635,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.691480014238635,
        "sum_squared": 0.4781446100914629,
        "min": 0.691480014238635,
        "max": 0.691480014238635,
        "mean": 0.691480014238635,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5407203931382368,
        "sum_squared": 0.2923785435555694,
        "min": 0.5407203931382368,
        "max": 0.5407203931382368,
        "mean": 0.5407203931382368,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5407203931382368,
        "sum_squared": 0.2923785435555694,
        "min": 0.5407203931382368,
        "max": 0.5407203931382368,
        "mean": 0.5407203931382368,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.08205128205128205,
        "sum_squared": 0.00673241288625904,
        "min": 0.08205128205128205,
        "max": 0.08205128205128205,
        "mean": 0.08205128205128205,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.08205128205128205,
        "sum_squared": 0.00673241288625904,
        "min": 0.08205128205128205,
        "max": 0.08205128205128205,
        "mean": 0.08205128205128205,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 23.487179487179485,
        "sum_squared": 551.6476002629848,
        "min": 23.487179487179485,
        "max": 23.487179487179485,
        "mean": 23.487179487179485,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 23.487179487179485,
        "sum_squared": 551.6476002629848,
        "min": 23.487179487179485,
        "max": 23.487179487179485,
        "mean": 23.487179487179485,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3638.181818181818,
        "sum_squared": 13236366.94214876,
        "min": 3638.181818181818,
        "max": 3638.181818181818,
        "mean": 3638.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3638.181818181818,
        "sum_squared": 13236366.94214876,
        "min": 3638.181818181818,
        "max": 3638.181818181818,
        "mean": 3638.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.394901535727761,
        "sum_squared": 1.9457502943756662,
        "min": 1.394901535727761,
        "max": 1.394901535727761,
        "mean": 1.394901535727761,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.394901535727761,
        "sum_squared": 1.9457502943756662,
        "min": 1.394901535727761,
        "max": 1.394901535727761,
        "mean": 1.394901535727761,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.909090909090909,
        "sum_squared": 24.09917355371901,
        "min": 4.909090909090909,
        "max": 4.909090909090909,
        "mean": 4.909090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.909090909090909,
        "sum_squared": 24.09917355371901,
        "min": 4.909090909090909,
        "max": 4.909090909090909,
        "mean": 4.909090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7487313324633899,
        "sum_squared": 0.5605986082124033,
        "min": 0.7487313324633899,
        "max": 0.7487313324633899,
        "mean": 0.7487313324633899,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7487313324633899,
        "sum_squared": 0.5605986082124033,
        "min": 0.7487313324633899,
        "max": 0.7487313324633899,
        "mean": 0.7487313324633899,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.829689608636977,
        "sum_squared": 0.6883848466801802,
        "min": 0.829689608636977,
        "max": 0.829689608636977,
        "mean": 0.829689608636977,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.829689608636977,
        "sum_squared": 0.6883848466801802,
        "min": 0.829689608636977,
        "max": 0.829689608636977,
        "mean": 0.829689608636977,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5569414914948082,
        "sum_squared": 0.31018382494846153,
        "min": 0.5569414914948082,
        "max": 0.5569414914948082,
        "mean": 0.5569414914948082,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5569414914948082,
        "sum_squared": 0.31018382494846153,
        "min": 0.5569414914948082,
        "max": 0.5569414914948082,
        "mean": 0.5569414914948082,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.15467159893503413,
        "sum_squared": 0.02392330351712005,
        "min": 0.15467159893503413,
        "max": 0.15467159893503413,
        "mean": 0.15467159893503413,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.15467159893503413,
        "sum_squared": 0.02392330351712005,
        "min": 0.15467159893503413,
        "max": 0.15467159893503413,
        "mean": 0.15467159893503413,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.272727272727273,
        "sum_squared": 410.9834710743802,
        "min": 20.272727272727273,
        "max": 20.272727272727273,
        "mean": 20.272727272727273,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.272727272727273,
        "sum_squared": 410.9834710743802,
        "min": 20.272727272727273,
        "max": 20.272727272727273,
        "mean": 20.272727272727273,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=anthropic_claude-2.1,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=anthropic_claude-2.1,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3673.6666666666665,
        "sum_squared": 13495826.777777776,
        "min": 3673.6666666666665,
        "max": 3673.6666666666665,
        "mean": 3673.6666666666665,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 5.574288093126738,
        "sum_squared": 31.07268774517452,
        "min": 5.574288093126738,
        "max": 5.574288093126738,
        "mean": 5.574288093126738,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.19303124034683725,
        "sum_squared": 0.03726105974983845,
        "min": 0.19303124034683725,
        "max": 0.19303124034683725,
        "mean": 0.19303124034683725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.16609678980171586,
        "sum_squared": 0.02758814358243538,
        "min": 0.16609678980171586,
        "max": 0.16609678980171586,
        "mean": 0.16609678980171586,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.09320178822615054,
        "sum_squared": 0.008686573328552213,
        "min": 0.09320178822615054,
        "max": 0.09320178822615054,
        "mean": 0.09320178822615054,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.01426482047348085,
        "sum_squared": 0.0002034851031406384,
        "min": 0.01426482047348085,
        "max": 0.01426482047348085,
        "mean": 0.01426482047348085,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 232.2051282051282,
        "sum_squared": 53919.221564760024,
        "min": 232.2051282051282,
        "max": 232.2051282051282,
        "mean": 232.2051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3651.181818181818,
        "sum_squared": 13331128.669421487,
        "min": 3651.181818181818,
        "max": 3651.181818181818,
        "mean": 3651.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 6.128317919644442,
        "sum_squared": 37.556280524235184,
        "min": 6.128317919644442,
        "max": 6.128317919644442,
        "mean": 6.128317919644442,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.21031996112390014,
        "sum_squared": 0.044234486047158864,
        "min": 0.21031996112390014,
        "max": 0.21031996112390014,
        "mean": 0.21031996112390014,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.17281140533829328,
        "sum_squared": 0.0298637818149959,
        "min": 0.17281140533829328,
        "max": 0.17281140533829328,
        "mean": 0.17281140533829328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0937424902445759,
        "sum_squared": 0.008787654477254407,
        "min": 0.0937424902445759,
        "max": 0.0937424902445759,
        "mean": 0.0937424902445759,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.018912933631934525,
        "sum_squared": 0.0003576990585659601,
        "min": 0.018912933631934525,
        "max": 0.018912933631934525,
        "mean": 0.018912933631934525,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 279.54545454545456,
        "sum_squared": 78145.6611570248,
        "min": 279.54545454545456,
        "max": 279.54545454545456,
        "mean": 279.54545454545456,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3673.6666666666665,
        "sum_squared": 13495826.777777776,
        "min": 3673.6666666666665,
        "max": 3673.6666666666665,
        "mean": 3673.6666666666665,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3673.6666666666665,
        "sum_squared": 13495826.777777776,
        "min": 3673.6666666666665,
        "max": 3673.6666666666665,
        "mean": 3673.6666666666665,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.574288093126738,
        "sum_squared": 31.07268774517452,
        "min": 5.574288093126738,
        "max": 5.574288093126738,
        "mean": 5.574288093126738,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.574288093126738,
        "sum_squared": 31.07268774517452,
        "min": 5.574288093126738,
        "max": 5.574288093126738,
        "mean": 5.574288093126738,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19303124034683725,
        "sum_squared": 0.03726105974983845,
        "min": 0.19303124034683725,
        "max": 0.19303124034683725,
        "mean": 0.19303124034683725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.19303124034683725,
        "sum_squared": 0.03726105974983845,
        "min": 0.19303124034683725,
        "max": 0.19303124034683725,
        "mean": 0.19303124034683725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16609678980171586,
        "sum_squared": 0.02758814358243538,
        "min": 0.16609678980171586,
        "max": 0.16609678980171586,
        "mean": 0.16609678980171586,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16609678980171586,
        "sum_squared": 0.02758814358243538,
        "min": 0.16609678980171586,
        "max": 0.16609678980171586,
        "mean": 0.16609678980171586,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09320178822615054,
        "sum_squared": 0.008686573328552213,
        "min": 0.09320178822615054,
        "max": 0.09320178822615054,
        "mean": 0.09320178822615054,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09320178822615054,
        "sum_squared": 0.008686573328552213,
        "min": 0.09320178822615054,
        "max": 0.09320178822615054,
        "mean": 0.09320178822615054,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.01426482047348085,
        "sum_squared": 0.0002034851031406384,
        "min": 0.01426482047348085,
        "max": 0.01426482047348085,
        "mean": 0.01426482047348085,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.01426482047348085,
        "sum_squared": 0.0002034851031406384,
        "min": 0.01426482047348085,
        "max": 0.01426482047348085,
        "mean": 0.01426482047348085,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.84615384615385,
        "sum_squared": 2484.639053254438,
        "min": 49.84615384615385,
        "max": 49.84615384615385,
        "mean": 49.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.2051282051282,
        "sum_squared": 53919.221564760024,
        "min": 232.2051282051282,
        "max": 232.2051282051282,
        "mean": 232.2051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.2051282051282,
        "sum_squared": 53919.221564760024,
        "min": 232.2051282051282,
        "max": 232.2051282051282,
        "mean": 232.2051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3651.181818181818,
        "sum_squared": 13331128.669421487,
        "min": 3651.181818181818,
        "max": 3651.181818181818,
        "mean": 3651.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3651.181818181818,
        "sum_squared": 13331128.669421487,
        "min": 3651.181818181818,
        "max": 3651.181818181818,
        "mean": 3651.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.128317919644442,
        "sum_squared": 37.556280524235184,
        "min": 6.128317919644442,
        "max": 6.128317919644442,
        "mean": 6.128317919644442,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.128317919644442,
        "sum_squared": 37.556280524235184,
        "min": 6.128317919644442,
        "max": 6.128317919644442,
        "mean": 6.128317919644442,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.21031996112390014,
        "sum_squared": 0.044234486047158864,
        "min": 0.21031996112390014,
        "max": 0.21031996112390014,
        "mean": 0.21031996112390014,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.21031996112390014,
        "sum_squared": 0.044234486047158864,
        "min": 0.21031996112390014,
        "max": 0.21031996112390014,
        "mean": 0.21031996112390014,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17281140533829328,
        "sum_squared": 0.0298637818149959,
        "min": 0.17281140533829328,
        "max": 0.17281140533829328,
        "mean": 0.17281140533829328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17281140533829328,
        "sum_squared": 0.0298637818149959,
        "min": 0.17281140533829328,
        "max": 0.17281140533829328,
        "mean": 0.17281140533829328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0937424902445759,
        "sum_squared": 0.008787654477254407,
        "min": 0.0937424902445759,
        "max": 0.0937424902445759,
        "mean": 0.0937424902445759,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0937424902445759,
        "sum_squared": 0.008787654477254407,
        "min": 0.0937424902445759,
        "max": 0.0937424902445759,
        "mean": 0.0937424902445759,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.018912933631934525,
        "sum_squared": 0.0003576990585659601,
        "min": 0.018912933631934525,
        "max": 0.018912933631934525,
        "mean": 0.018912933631934525,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.018912933631934525,
        "sum_squared": 0.0003576990585659601,
        "min": 0.018912933631934525,
        "max": 0.018912933631934525,
        "mean": 0.018912933631934525,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 60.36363636363637,
        "sum_squared": 3643.768595041323,
        "min": 60.36363636363637,
        "max": 60.36363636363637,
        "mean": 60.36363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 279.54545454545456,
        "sum_squared": 78145.6611570248,
        "min": 279.54545454545456,
        "max": 279.54545454545456,
        "mean": 279.54545454545456,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 279.54545454545456,
        "sum_squared": 78145.6611570248,
        "min": 279.54545454545456,
        "max": 279.54545454545456,
        "mean": 279.54545454545456,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=anthropic_claude-instant-1.2,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=anthropic_claude-instant-1.2,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3673.6666666666665,
        "sum_squared": 13495826.777777776,
        "min": 3673.6666666666665,
        "max": 3673.6666666666665,
        "mean": 3673.6666666666665,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.67323461557046,
        "sum_squared": 7.146183309884144,
        "min": 2.67323461557046,
        "max": 2.67323461557046,
        "mean": 2.67323461557046,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.06143893649725142,
        "sum_squared": 0.0037747429179132925,
        "min": 0.06143893649725142,
        "max": 0.06143893649725142,
        "mean": 0.06143893649725142,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.054092211099347004,
        "sum_squared": 0.0029259673016163194,
        "min": 0.054092211099347004,
        "max": 0.054092211099347004,
        "mean": 0.054092211099347004,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.044360524649134446,
        "sum_squared": 0.0019678561471464648,
        "min": 0.044360524649134446,
        "max": 0.044360524649134446,
        "mean": 0.044360524649134446,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0003373819163292848,
        "sum_squared": 1.1382655746602052e-07,
        "min": 0.0003373819163292848,
        "max": 0.0003373819163292848,
        "mean": 0.0003373819163292848,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 237.28205128205127,
        "sum_squared": 56302.771860618006,
        "min": 237.28205128205127,
        "max": 237.28205128205127,
        "mean": 237.28205128205127,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3651.181818181818,
        "sum_squared": 13331128.669421487,
        "min": 3651.181818181818,
        "max": 3651.181818181818,
        "mean": 3651.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.179094661365856,
        "sum_squared": 10.106642865924886,
        "min": 3.179094661365856,
        "max": 3.179094661365856,
        "mean": 3.179094661365856,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.08343475139607866,
        "sum_squared": 0.00696135774052545,
        "min": 0.08343475139607866,
        "max": 0.08343475139607866,
        "mean": 0.08343475139607866,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0847823066413881,
        "sum_squared": 0.00718803951943436,
        "min": 0.0847823066413881,
        "max": 0.0847823066413881,
        "mean": 0.0847823066413881,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.07162353118066413,
        "sum_squared": 0.005129930218787567,
        "min": 0.07162353118066413,
        "max": 0.07162353118066413,
        "mean": 0.07162353118066413,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.225073858507263e-308,
        "sum_squared": 0.0,
        "min": 2.225073858507263e-308,
        "max": 2.225073858507263e-308,
        "mean": 2.225073858507263e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 213.0,
        "sum_squared": 45369.0,
        "min": 213.0,
        "max": 213.0,
        "mean": 213.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3673.6666666666665,
        "sum_squared": 13495826.777777776,
        "min": 3673.6666666666665,
        "max": 3673.6666666666665,
        "mean": 3673.6666666666665,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3673.6666666666665,
        "sum_squared": 13495826.777777776,
        "min": 3673.6666666666665,
        "max": 3673.6666666666665,
        "mean": 3673.6666666666665,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.67323461557046,
        "sum_squared": 7.146183309884144,
        "min": 2.67323461557046,
        "max": 2.67323461557046,
        "mean": 2.67323461557046,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.67323461557046,
        "sum_squared": 7.146183309884144,
        "min": 2.67323461557046,
        "max": 2.67323461557046,
        "mean": 2.67323461557046,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06143893649725142,
        "sum_squared": 0.0037747429179132925,
        "min": 0.06143893649725142,
        "max": 0.06143893649725142,
        "mean": 0.06143893649725142,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06143893649725142,
        "sum_squared": 0.0037747429179132925,
        "min": 0.06143893649725142,
        "max": 0.06143893649725142,
        "mean": 0.06143893649725142,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.054092211099347004,
        "sum_squared": 0.0029259673016163194,
        "min": 0.054092211099347004,
        "max": 0.054092211099347004,
        "mean": 0.054092211099347004,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.054092211099347004,
        "sum_squared": 0.0029259673016163194,
        "min": 0.054092211099347004,
        "max": 0.054092211099347004,
        "mean": 0.054092211099347004,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.044360524649134446,
        "sum_squared": 0.0019678561471464648,
        "min": 0.044360524649134446,
        "max": 0.044360524649134446,
        "mean": 0.044360524649134446,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.044360524649134446,
        "sum_squared": 0.0019678561471464648,
        "min": 0.044360524649134446,
        "max": 0.044360524649134446,
        "mean": 0.044360524649134446,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0003373819163292848,
        "sum_squared": 1.1382655746602052e-07,
        "min": 0.0003373819163292848,
        "max": 0.0003373819163292848,
        "mean": 0.0003373819163292848,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0003373819163292848,
        "sum_squared": 1.1382655746602052e-07,
        "min": 0.0003373819163292848,
        "max": 0.0003373819163292848,
        "mean": 0.0003373819163292848,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.17948717948718,
        "sum_squared": 2418.621959237344,
        "min": 49.17948717948718,
        "max": 49.17948717948718,
        "mean": 49.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 237.28205128205127,
        "sum_squared": 56302.771860618006,
        "min": 237.28205128205127,
        "max": 237.28205128205127,
        "mean": 237.28205128205127,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 237.28205128205127,
        "sum_squared": 56302.771860618006,
        "min": 237.28205128205127,
        "max": 237.28205128205127,
        "mean": 237.28205128205127,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3651.181818181818,
        "sum_squared": 13331128.669421487,
        "min": 3651.181818181818,
        "max": 3651.181818181818,
        "mean": 3651.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3651.181818181818,
        "sum_squared": 13331128.669421487,
        "min": 3651.181818181818,
        "max": 3651.181818181818,
        "mean": 3651.181818181818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.179094661365856,
        "sum_squared": 10.106642865924886,
        "min": 3.179094661365856,
        "max": 3.179094661365856,
        "mean": 3.179094661365856,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.179094661365856,
        "sum_squared": 10.106642865924886,
        "min": 3.179094661365856,
        "max": 3.179094661365856,
        "mean": 3.179094661365856,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.08343475139607866,
        "sum_squared": 0.00696135774052545,
        "min": 0.08343475139607866,
        "max": 0.08343475139607866,
        "mean": 0.08343475139607866,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.08343475139607866,
        "sum_squared": 0.00696135774052545,
        "min": 0.08343475139607866,
        "max": 0.08343475139607866,
        "mean": 0.08343475139607866,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0847823066413881,
        "sum_squared": 0.00718803951943436,
        "min": 0.0847823066413881,
        "max": 0.0847823066413881,
        "mean": 0.0847823066413881,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0847823066413881,
        "sum_squared": 0.00718803951943436,
        "min": 0.0847823066413881,
        "max": 0.0847823066413881,
        "mean": 0.0847823066413881,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.07162353118066413,
        "sum_squared": 0.005129930218787567,
        "min": 0.07162353118066413,
        "max": 0.07162353118066413,
        "mean": 0.07162353118066413,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.07162353118066413,
        "sum_squared": 0.005129930218787567,
        "min": 0.07162353118066413,
        "max": 0.07162353118066413,
        "mean": 0.07162353118066413,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.225073858507263e-308,
        "sum_squared": 0.0,
        "min": 2.225073858507263e-308,
        "max": 2.225073858507263e-308,
        "mean": 2.225073858507263e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.225073858507263e-308,
        "sum_squared": 0.0,
        "min": 2.225073858507263e-308,
        "max": 2.225073858507263e-308,
        "mean": 2.225073858507263e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.09090909090909,
        "sum_squared": 2033.1900826446283,
        "min": 45.09090909090909,
        "max": 45.09090909090909,
        "mean": 45.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.0,
        "sum_squared": 45369.0,
        "min": 213.0,
        "max": 213.0,
        "mean": 213.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.0,
        "sum_squared": 45369.0,
        "min": 213.0,
        "max": 213.0,
        "mean": 213.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=google_gemma-7b,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=google_gemma-7b,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3441.25641025641,
        "sum_squared": 11842245.681130834,
        "min": 3441.25641025641,
        "max": 3441.25641025641,
        "mean": 3441.25641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5675170727265186,
        "sum_squared": 0.32207562783607663,
        "min": 0.5675170727265186,
        "max": 0.5675170727265186,
        "mean": 0.5675170727265186,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2564102564102564,
        "sum_squared": 0.06574621959237342,
        "min": 0.2564102564102564,
        "max": 0.2564102564102564,
        "mean": 0.2564102564102564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7584184292329089,
        "sum_squared": 0.5751985138001129,
        "min": 0.7584184292329089,
        "max": 0.7584184292329089,
        "mean": 0.7584184292329089,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7332347306330903,
        "sum_squared": 0.5376331702065805,
        "min": 0.7332347306330903,
        "max": 0.7332347306330903,
        "mean": 0.7332347306330903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5118731605701805,
        "sum_squared": 0.2620141325121058,
        "min": 0.5118731605701805,
        "max": 0.5118731605701805,
        "mean": 0.5118731605701805,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.06154303430861885,
        "sum_squared": 0.003787545071911837,
        "min": 0.06154303430861885,
        "max": 0.06154303430861885,
        "mean": 0.06154303430861885,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 26.384615384615383,
        "sum_squared": 696.1479289940828,
        "min": 26.384615384615383,
        "max": 26.384615384615383,
        "mean": 26.384615384615383,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3415.090909090909,
        "sum_squared": 11662845.917355372,
        "min": 3415.090909090909,
        "max": 3415.090909090909,
        "mean": 3415.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5964052026922052,
        "sum_squared": 0.3556991657983304,
        "min": 0.5964052026922052,
        "max": 0.5964052026922052,
        "mean": 0.5964052026922052,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6363636363636364,
        "sum_squared": 0.4049586776859504,
        "min": 0.6363636363636364,
        "max": 0.6363636363636364,
        "mean": 0.6363636363636364,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8636363636363636,
        "sum_squared": 0.7458677685950413,
        "min": 0.8636363636363636,
        "max": 0.8636363636363636,
        "mean": 0.8636363636363636,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8535353535353536,
        "sum_squared": 0.728522599734721,
        "min": 0.8535353535353536,
        "max": 0.8535353535353536,
        "mean": 0.8535353535353536,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6643085918142848,
        "sum_squared": 0.44130590515827806,
        "min": 0.6643085918142848,
        "max": 0.6643085918142848,
        "mean": 0.6643085918142848,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.1253455257337641,
        "sum_squared": 0.015711500821473715,
        "min": 0.1253455257337641,
        "max": 0.1253455257337641,
        "mean": 0.1253455257337641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 18.636363636363637,
        "sum_squared": 347.3140495867769,
        "min": 18.636363636363637,
        "max": 18.636363636363637,
        "mean": 18.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3441.25641025641,
        "sum_squared": 11842245.681130834,
        "min": 3441.25641025641,
        "max": 3441.25641025641,
        "mean": 3441.25641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3441.25641025641,
        "sum_squared": 11842245.681130834,
        "min": 3441.25641025641,
        "max": 3441.25641025641,
        "mean": 3441.25641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5675170727265186,
        "sum_squared": 0.32207562783607663,
        "min": 0.5675170727265186,
        "max": 0.5675170727265186,
        "mean": 0.5675170727265186,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5675170727265186,
        "sum_squared": 0.32207562783607663,
        "min": 0.5675170727265186,
        "max": 0.5675170727265186,
        "mean": 0.5675170727265186,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2564102564102564,
        "sum_squared": 0.06574621959237342,
        "min": 0.2564102564102564,
        "max": 0.2564102564102564,
        "mean": 0.2564102564102564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2564102564102564,
        "sum_squared": 0.06574621959237342,
        "min": 0.2564102564102564,
        "max": 0.2564102564102564,
        "mean": 0.2564102564102564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7584184292329089,
        "sum_squared": 0.5751985138001129,
        "min": 0.7584184292329089,
        "max": 0.7584184292329089,
        "mean": 0.7584184292329089,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7584184292329089,
        "sum_squared": 0.5751985138001129,
        "min": 0.7584184292329089,
        "max": 0.7584184292329089,
        "mean": 0.7584184292329089,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7332347306330903,
        "sum_squared": 0.5376331702065805,
        "min": 0.7332347306330903,
        "max": 0.7332347306330903,
        "mean": 0.7332347306330903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7332347306330903,
        "sum_squared": 0.5376331702065805,
        "min": 0.7332347306330903,
        "max": 0.7332347306330903,
        "mean": 0.7332347306330903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5118731605701805,
        "sum_squared": 0.2620141325121058,
        "min": 0.5118731605701805,
        "max": 0.5118731605701805,
        "mean": 0.5118731605701805,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5118731605701805,
        "sum_squared": 0.2620141325121058,
        "min": 0.5118731605701805,
        "max": 0.5118731605701805,
        "mean": 0.5118731605701805,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06154303430861885,
        "sum_squared": 0.003787545071911837,
        "min": 0.06154303430861885,
        "max": 0.06154303430861885,
        "mean": 0.06154303430861885,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06154303430861885,
        "sum_squared": 0.003787545071911837,
        "min": 0.06154303430861885,
        "max": 0.06154303430861885,
        "mean": 0.06154303430861885,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 26.384615384615383,
        "sum_squared": 696.1479289940828,
        "min": 26.384615384615383,
        "max": 26.384615384615383,
        "mean": 26.384615384615383,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 26.384615384615383,
        "sum_squared": 696.1479289940828,
        "min": 26.384615384615383,
        "max": 26.384615384615383,
        "mean": 26.384615384615383,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3415.090909090909,
        "sum_squared": 11662845.917355372,
        "min": 3415.090909090909,
        "max": 3415.090909090909,
        "mean": 3415.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3415.090909090909,
        "sum_squared": 11662845.917355372,
        "min": 3415.090909090909,
        "max": 3415.090909090909,
        "mean": 3415.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5964052026922052,
        "sum_squared": 0.3556991657983304,
        "min": 0.5964052026922052,
        "max": 0.5964052026922052,
        "mean": 0.5964052026922052,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5964052026922052,
        "sum_squared": 0.3556991657983304,
        "min": 0.5964052026922052,
        "max": 0.5964052026922052,
        "mean": 0.5964052026922052,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6363636363636364,
        "sum_squared": 0.4049586776859504,
        "min": 0.6363636363636364,
        "max": 0.6363636363636364,
        "mean": 0.6363636363636364,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6363636363636364,
        "sum_squared": 0.4049586776859504,
        "min": 0.6363636363636364,
        "max": 0.6363636363636364,
        "mean": 0.6363636363636364,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8636363636363636,
        "sum_squared": 0.7458677685950413,
        "min": 0.8636363636363636,
        "max": 0.8636363636363636,
        "mean": 0.8636363636363636,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8636363636363636,
        "sum_squared": 0.7458677685950413,
        "min": 0.8636363636363636,
        "max": 0.8636363636363636,
        "mean": 0.8636363636363636,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8535353535353536,
        "sum_squared": 0.728522599734721,
        "min": 0.8535353535353536,
        "max": 0.8535353535353536,
        "mean": 0.8535353535353536,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8535353535353536,
        "sum_squared": 0.728522599734721,
        "min": 0.8535353535353536,
        "max": 0.8535353535353536,
        "mean": 0.8535353535353536,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6643085918142848,
        "sum_squared": 0.44130590515827806,
        "min": 0.6643085918142848,
        "max": 0.6643085918142848,
        "mean": 0.6643085918142848,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6643085918142848,
        "sum_squared": 0.44130590515827806,
        "min": 0.6643085918142848,
        "max": 0.6643085918142848,
        "mean": 0.6643085918142848,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1253455257337641,
        "sum_squared": 0.015711500821473715,
        "min": 0.1253455257337641,
        "max": 0.1253455257337641,
        "mean": 0.1253455257337641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1253455257337641,
        "sum_squared": 0.015711500821473715,
        "min": 0.1253455257337641,
        "max": 0.1253455257337641,
        "mean": 0.1253455257337641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 18.636363636363637,
        "sum_squared": 347.3140495867769,
        "min": 18.636363636363637,
        "max": 18.636363636363637,
        "mean": 18.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 18.636363636363637,
        "sum_squared": 347.3140495867769,
        "min": 18.636363636363637,
        "max": 18.636363636363637,
        "mean": 18.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=google_gemma-7b-it,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=google_gemma-7b-it,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3441.25641025641,
        "sum_squared": 11842245.681130834,
        "min": 3441.25641025641,
        "max": 3441.25641025641,
        "mean": 3441.25641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6853438890897311,
        "sum_squared": 0.4696962463126376,
        "min": 0.6853438890897311,
        "max": 0.6853438890897311,
        "mean": 0.6853438890897311,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.05128205128205128,
        "sum_squared": 0.0026298487836949372,
        "min": 0.05128205128205128,
        "max": 0.05128205128205128,
        "mean": 0.05128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.1284792234482637,
        "sum_squared": 0.016506910857868876,
        "min": 0.1284792234482637,
        "max": 0.1284792234482637,
        "mean": 0.1284792234482637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.14420910703497183,
        "sum_squared": 0.020796266551823962,
        "min": 0.14420910703497183,
        "max": 0.14420910703497183,
        "mean": 0.14420910703497183,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.142287700294569,
        "sum_squared": 0.02024578965511709,
        "min": 0.142287700294569,
        "max": 0.142287700294569,
        "mean": 0.142287700294569,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 1.237537094898655e-308,
        "sum_squared": 0.0,
        "min": 1.237537094898655e-308,
        "max": 1.237537094898655e-308,
        "mean": 1.237537094898655e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 37.84615384615385,
        "sum_squared": 1432.3313609467457,
        "min": 37.84615384615385,
        "max": 37.84615384615385,
        "mean": 37.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3415.090909090909,
        "sum_squared": 11662845.917355372,
        "min": 3415.090909090909,
        "max": 3415.090909090909,
        "mean": 3415.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6091475703499534,
        "sum_squared": 0.37106076246325137,
        "min": 0.6091475703499534,
        "max": 0.6091475703499534,
        "mean": 0.6091475703499534,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.046409472880061114,
        "sum_squared": 0.0021538391730051283,
        "min": 0.046409472880061114,
        "max": 0.046409472880061114,
        "mean": 0.046409472880061114,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.12454253614540695,
        "sum_squared": 0.015510843309529995,
        "min": 0.12454253614540695,
        "max": 0.12454253614540695,
        "mean": 0.12454253614540695,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.11477222818034778,
        "sum_squared": 0.013172664361481817,
        "min": 0.11477222818034778,
        "max": 0.11477222818034778,
        "mean": 0.11477222818034778,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.076746003390771e-308,
        "sum_squared": 0.0,
        "min": 1.076746003390771e-308,
        "max": 1.076746003390771e-308,
        "mean": 1.076746003390771e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 22.636363636363637,
        "sum_squared": 512.404958677686,
        "min": 22.636363636363637,
        "max": 22.636363636363637,
        "mean": 22.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3441.25641025641,
        "sum_squared": 11842245.681130834,
        "min": 3441.25641025641,
        "max": 3441.25641025641,
        "mean": 3441.25641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3441.25641025641,
        "sum_squared": 11842245.681130834,
        "min": 3441.25641025641,
        "max": 3441.25641025641,
        "mean": 3441.25641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6853438890897311,
        "sum_squared": 0.4696962463126376,
        "min": 0.6853438890897311,
        "max": 0.6853438890897311,
        "mean": 0.6853438890897311,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6853438890897311,
        "sum_squared": 0.4696962463126376,
        "min": 0.6853438890897311,
        "max": 0.6853438890897311,
        "mean": 0.6853438890897311,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.05128205128205128,
        "sum_squared": 0.0026298487836949372,
        "min": 0.05128205128205128,
        "max": 0.05128205128205128,
        "mean": 0.05128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.05128205128205128,
        "sum_squared": 0.0026298487836949372,
        "min": 0.05128205128205128,
        "max": 0.05128205128205128,
        "mean": 0.05128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1284792234482637,
        "sum_squared": 0.016506910857868876,
        "min": 0.1284792234482637,
        "max": 0.1284792234482637,
        "mean": 0.1284792234482637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1284792234482637,
        "sum_squared": 0.016506910857868876,
        "min": 0.1284792234482637,
        "max": 0.1284792234482637,
        "mean": 0.1284792234482637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14420910703497183,
        "sum_squared": 0.020796266551823962,
        "min": 0.14420910703497183,
        "max": 0.14420910703497183,
        "mean": 0.14420910703497183,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14420910703497183,
        "sum_squared": 0.020796266551823962,
        "min": 0.14420910703497183,
        "max": 0.14420910703497183,
        "mean": 0.14420910703497183,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.142287700294569,
        "sum_squared": 0.02024578965511709,
        "min": 0.142287700294569,
        "max": 0.142287700294569,
        "mean": 0.142287700294569,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.142287700294569,
        "sum_squared": 0.02024578965511709,
        "min": 0.142287700294569,
        "max": 0.142287700294569,
        "mean": 0.142287700294569,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.237537094898655e-308,
        "sum_squared": 0.0,
        "min": 1.237537094898655e-308,
        "max": 1.237537094898655e-308,
        "mean": 1.237537094898655e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.237537094898655e-308,
        "sum_squared": 0.0,
        "min": 1.237537094898655e-308,
        "max": 1.237537094898655e-308,
        "mean": 1.237537094898655e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 37.84615384615385,
        "sum_squared": 1432.3313609467457,
        "min": 37.84615384615385,
        "max": 37.84615384615385,
        "mean": 37.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 37.84615384615385,
        "sum_squared": 1432.3313609467457,
        "min": 37.84615384615385,
        "max": 37.84615384615385,
        "mean": 37.84615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3415.090909090909,
        "sum_squared": 11662845.917355372,
        "min": 3415.090909090909,
        "max": 3415.090909090909,
        "mean": 3415.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3415.090909090909,
        "sum_squared": 11662845.917355372,
        "min": 3415.090909090909,
        "max": 3415.090909090909,
        "mean": 3415.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6091475703499534,
        "sum_squared": 0.37106076246325137,
        "min": 0.6091475703499534,
        "max": 0.6091475703499534,
        "mean": 0.6091475703499534,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6091475703499534,
        "sum_squared": 0.37106076246325137,
        "min": 0.6091475703499534,
        "max": 0.6091475703499534,
        "mean": 0.6091475703499534,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.046409472880061114,
        "sum_squared": 0.0021538391730051283,
        "min": 0.046409472880061114,
        "max": 0.046409472880061114,
        "mean": 0.046409472880061114,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.046409472880061114,
        "sum_squared": 0.0021538391730051283,
        "min": 0.046409472880061114,
        "max": 0.046409472880061114,
        "mean": 0.046409472880061114,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.12454253614540695,
        "sum_squared": 0.015510843309529995,
        "min": 0.12454253614540695,
        "max": 0.12454253614540695,
        "mean": 0.12454253614540695,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.12454253614540695,
        "sum_squared": 0.015510843309529995,
        "min": 0.12454253614540695,
        "max": 0.12454253614540695,
        "mean": 0.12454253614540695,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.11477222818034778,
        "sum_squared": 0.013172664361481817,
        "min": 0.11477222818034778,
        "max": 0.11477222818034778,
        "mean": 0.11477222818034778,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.11477222818034778,
        "sum_squared": 0.013172664361481817,
        "min": 0.11477222818034778,
        "max": 0.11477222818034778,
        "mean": 0.11477222818034778,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.076746003390771e-308,
        "sum_squared": 0.0,
        "min": 1.076746003390771e-308,
        "max": 1.076746003390771e-308,
        "mean": 1.076746003390771e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.076746003390771e-308,
        "sum_squared": 0.0,
        "min": 1.076746003390771e-308,
        "max": 1.076746003390771e-308,
        "mean": 1.076746003390771e-308,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 22.636363636363637,
        "sum_squared": 512.404958677686,
        "min": 22.636363636363637,
        "max": 22.636363636363637,
        "mean": 22.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 22.636363636363637,
        "sum_squared": 512.404958677686,
        "min": 22.636363636363637,
        "max": 22.636363636363637,
        "mean": 22.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=google_text-bison@001,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=google_text-bison@001,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3480.2820512820513,
        "sum_squared": 12112363.156476002,
        "min": 3480.2820512820513,
        "max": 3480.2820512820513,
        "mean": 3480.2820512820513,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.408413752531394,
        "sum_squared": 1.9836292983195627,
        "min": 1.408413752531394,
        "max": 1.408413752531394,
        "mean": 1.408413752531394,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.2564102564102564,
        "sum_squared": 0.06574621959237342,
        "min": 0.2564102564102564,
        "max": 0.2564102564102564,
        "mean": 0.2564102564102564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5128205128205128,
        "sum_squared": 0.2629848783694937,
        "min": 0.5128205128205128,
        "max": 0.5128205128205128,
        "mean": 0.5128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7358718623424506,
        "sum_squared": 0.5415073977873466,
        "min": 0.7358718623424506,
        "max": 0.7358718623424506,
        "mean": 0.7358718623424506,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7400467472865663,
        "sum_squared": 0.5476691881694269,
        "min": 0.7400467472865663,
        "max": 0.7400467472865663,
        "mean": 0.7400467472865663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4866862355268994,
        "sum_squared": 0.2368634918513446,
        "min": 0.4866862355268994,
        "max": 0.4866862355268994,
        "mean": 0.4866862355268994,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.025351966880547996,
        "sum_squared": 0.0006427222247124025,
        "min": 0.025351966880547996,
        "max": 0.025351966880547996,
        "mean": 0.025351966880547996,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3454.4545454545455,
        "sum_squared": 11933256.20661157,
        "min": 3454.4545454545455,
        "max": 3454.4545454545455,
        "mean": 3454.4545454545455,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.5085403485731645,
        "sum_squared": 2.275693983273245,
        "min": 1.5085403485731645,
        "max": 1.5085403485731645,
        "mean": 1.5085403485731645,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6363636363636364,
        "sum_squared": 0.4049586776859504,
        "min": 0.6363636363636364,
        "max": 0.6363636363636364,
        "mean": 0.6363636363636364,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.793388429752066,
        "sum_squared": 0.629465200464449,
        "min": 0.793388429752066,
        "max": 0.793388429752066,
        "mean": 0.793388429752066,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8586635586635587,
        "sum_squared": 0.7373031069767667,
        "min": 0.8586635586635587,
        "max": 0.8586635586635587,
        "mean": 0.8586635586635587,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6585751460980557,
        "sum_squared": 0.43372122305807537,
        "min": 0.6585751460980557,
        "max": 0.6585751460980557,
        "mean": 0.6585751460980557,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.1221019939089528,
        "sum_squared": 0.014908896916541944,
        "min": 0.1221019939089528,
        "max": 0.1221019939089528,
        "mean": 0.1221019939089528,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3480.2820512820513,
        "sum_squared": 12112363.156476002,
        "min": 3480.2820512820513,
        "max": 3480.2820512820513,
        "mean": 3480.2820512820513,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3480.2820512820513,
        "sum_squared": 12112363.156476002,
        "min": 3480.2820512820513,
        "max": 3480.2820512820513,
        "mean": 3480.2820512820513,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.408413752531394,
        "sum_squared": 1.9836292983195627,
        "min": 1.408413752531394,
        "max": 1.408413752531394,
        "mean": 1.408413752531394,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.408413752531394,
        "sum_squared": 1.9836292983195627,
        "min": 1.408413752531394,
        "max": 1.408413752531394,
        "mean": 1.408413752531394,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2564102564102564,
        "sum_squared": 0.06574621959237342,
        "min": 0.2564102564102564,
        "max": 0.2564102564102564,
        "mean": 0.2564102564102564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2564102564102564,
        "sum_squared": 0.06574621959237342,
        "min": 0.2564102564102564,
        "max": 0.2564102564102564,
        "mean": 0.2564102564102564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5128205128205128,
        "sum_squared": 0.2629848783694937,
        "min": 0.5128205128205128,
        "max": 0.5128205128205128,
        "mean": 0.5128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5128205128205128,
        "sum_squared": 0.2629848783694937,
        "min": 0.5128205128205128,
        "max": 0.5128205128205128,
        "mean": 0.5128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7358718623424506,
        "sum_squared": 0.5415073977873466,
        "min": 0.7358718623424506,
        "max": 0.7358718623424506,
        "mean": 0.7358718623424506,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7358718623424506,
        "sum_squared": 0.5415073977873466,
        "min": 0.7358718623424506,
        "max": 0.7358718623424506,
        "mean": 0.7358718623424506,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7400467472865663,
        "sum_squared": 0.5476691881694269,
        "min": 0.7400467472865663,
        "max": 0.7400467472865663,
        "mean": 0.7400467472865663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7400467472865663,
        "sum_squared": 0.5476691881694269,
        "min": 0.7400467472865663,
        "max": 0.7400467472865663,
        "mean": 0.7400467472865663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4866862355268994,
        "sum_squared": 0.2368634918513446,
        "min": 0.4866862355268994,
        "max": 0.4866862355268994,
        "mean": 0.4866862355268994,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4866862355268994,
        "sum_squared": 0.2368634918513446,
        "min": 0.4866862355268994,
        "max": 0.4866862355268994,
        "mean": 0.4866862355268994,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.025351966880547996,
        "sum_squared": 0.0006427222247124025,
        "min": 0.025351966880547996,
        "max": 0.025351966880547996,
        "mean": 0.025351966880547996,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.025351966880547996,
        "sum_squared": 0.0006427222247124025,
        "min": 0.025351966880547996,
        "max": 0.025351966880547996,
        "mean": 0.025351966880547996,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3454.4545454545455,
        "sum_squared": 11933256.20661157,
        "min": 3454.4545454545455,
        "max": 3454.4545454545455,
        "mean": 3454.4545454545455,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3454.4545454545455,
        "sum_squared": 11933256.20661157,
        "min": 3454.4545454545455,
        "max": 3454.4545454545455,
        "mean": 3454.4545454545455,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.5085403485731645,
        "sum_squared": 2.275693983273245,
        "min": 1.5085403485731645,
        "max": 1.5085403485731645,
        "mean": 1.5085403485731645,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.5085403485731645,
        "sum_squared": 2.275693983273245,
        "min": 1.5085403485731645,
        "max": 1.5085403485731645,
        "mean": 1.5085403485731645,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6363636363636364,
        "sum_squared": 0.4049586776859504,
        "min": 0.6363636363636364,
        "max": 0.6363636363636364,
        "mean": 0.6363636363636364,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6363636363636364,
        "sum_squared": 0.4049586776859504,
        "min": 0.6363636363636364,
        "max": 0.6363636363636364,
        "mean": 0.6363636363636364,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.793388429752066,
        "sum_squared": 0.629465200464449,
        "min": 0.793388429752066,
        "max": 0.793388429752066,
        "mean": 0.793388429752066,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.793388429752066,
        "sum_squared": 0.629465200464449,
        "min": 0.793388429752066,
        "max": 0.793388429752066,
        "mean": 0.793388429752066,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8586635586635587,
        "sum_squared": 0.7373031069767667,
        "min": 0.8586635586635587,
        "max": 0.8586635586635587,
        "mean": 0.8586635586635587,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8586635586635587,
        "sum_squared": 0.7373031069767667,
        "min": 0.8586635586635587,
        "max": 0.8586635586635587,
        "mean": 0.8586635586635587,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6585751460980557,
        "sum_squared": 0.43372122305807537,
        "min": 0.6585751460980557,
        "max": 0.6585751460980557,
        "mean": 0.6585751460980557,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6585751460980557,
        "sum_squared": 0.43372122305807537,
        "min": 0.6585751460980557,
        "max": 0.6585751460980557,
        "mean": 0.6585751460980557,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1221019939089528,
        "sum_squared": 0.014908896916541944,
        "min": 0.1221019939089528,
        "max": 0.1221019939089528,
        "mean": 0.1221019939089528,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1221019939089528,
        "sum_squared": 0.014908896916541944,
        "min": 0.1221019939089528,
        "max": 0.1221019939089528,
        "mean": 0.1221019939089528,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=google_text-unicorn@001,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=google_text-unicorn@001,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3480.2820512820513,
        "sum_squared": 12112363.156476002,
        "min": 3480.2820512820513,
        "max": 3480.2820512820513,
        "mean": 3480.2820512820513,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.3497891303820486,
        "sum_squared": 5.5215089572616245,
        "min": 2.3497891303820486,
        "max": 2.3497891303820486,
        "mean": 2.3497891303820486,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.15384615384615385,
        "sum_squared": 0.02366863905325444,
        "min": 0.15384615384615385,
        "max": 0.15384615384615385,
        "mean": 0.15384615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4358974358974359,
        "sum_squared": 0.19000657462195925,
        "min": 0.4358974358974359,
        "max": 0.4358974358974359,
        "mean": 0.4358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5868778280542986,
        "sum_squared": 0.3444255850617308,
        "min": 0.5868778280542986,
        "max": 0.5868778280542986,
        "mean": 0.5868778280542986,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6090558111018469,
        "sum_squared": 0.37094898103692864,
        "min": 0.6090558111018469,
        "max": 0.6090558111018469,
        "mean": 0.6090558111018469,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.36269585595758447,
        "sum_squared": 0.13154828392880485,
        "min": 0.36269585595758447,
        "max": 0.36269585595758447,
        "mean": 0.36269585595758447,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.019230769230769232,
        "sum_squared": 0.00036982248520710064,
        "min": 0.019230769230769232,
        "max": 0.019230769230769232,
        "mean": 0.019230769230769232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3454.4545454545455,
        "sum_squared": 11933256.20661157,
        "min": 3454.4545454545455,
        "max": 3454.4545454545455,
        "mean": 3454.4545454545455,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.549782081083818,
        "sum_squared": 6.501388661016125,
        "min": 2.549782081083818,
        "max": 2.549782081083818,
        "mean": 2.549782081083818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6232323232323232,
        "sum_squared": 0.388418528721559,
        "min": 0.6232323232323232,
        "max": 0.6232323232323232,
        "mean": 0.6232323232323232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3850599854375863,
        "sum_squared": 0.14827119238519418,
        "min": 0.3850599854375863,
        "max": 0.3850599854375863,
        "mean": 0.3850599854375863,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.07080007118830954,
        "sum_squared": 0.0050126500802696984,
        "min": 0.07080007118830954,
        "max": 0.07080007118830954,
        "mean": 0.07080007118830954,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3480.2820512820513,
        "sum_squared": 12112363.156476002,
        "min": 3480.2820512820513,
        "max": 3480.2820512820513,
        "mean": 3480.2820512820513,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3480.2820512820513,
        "sum_squared": 12112363.156476002,
        "min": 3480.2820512820513,
        "max": 3480.2820512820513,
        "mean": 3480.2820512820513,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.3497891303820486,
        "sum_squared": 5.5215089572616245,
        "min": 2.3497891303820486,
        "max": 2.3497891303820486,
        "mean": 2.3497891303820486,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.3497891303820486,
        "sum_squared": 5.5215089572616245,
        "min": 2.3497891303820486,
        "max": 2.3497891303820486,
        "mean": 2.3497891303820486,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.15384615384615385,
        "sum_squared": 0.02366863905325444,
        "min": 0.15384615384615385,
        "max": 0.15384615384615385,
        "mean": 0.15384615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.15384615384615385,
        "sum_squared": 0.02366863905325444,
        "min": 0.15384615384615385,
        "max": 0.15384615384615385,
        "mean": 0.15384615384615385,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4358974358974359,
        "sum_squared": 0.19000657462195925,
        "min": 0.4358974358974359,
        "max": 0.4358974358974359,
        "mean": 0.4358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4358974358974359,
        "sum_squared": 0.19000657462195925,
        "min": 0.4358974358974359,
        "max": 0.4358974358974359,
        "mean": 0.4358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5868778280542986,
        "sum_squared": 0.3444255850617308,
        "min": 0.5868778280542986,
        "max": 0.5868778280542986,
        "mean": 0.5868778280542986,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5868778280542986,
        "sum_squared": 0.3444255850617308,
        "min": 0.5868778280542986,
        "max": 0.5868778280542986,
        "mean": 0.5868778280542986,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6090558111018469,
        "sum_squared": 0.37094898103692864,
        "min": 0.6090558111018469,
        "max": 0.6090558111018469,
        "mean": 0.6090558111018469,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6090558111018469,
        "sum_squared": 0.37094898103692864,
        "min": 0.6090558111018469,
        "max": 0.6090558111018469,
        "mean": 0.6090558111018469,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36269585595758447,
        "sum_squared": 0.13154828392880485,
        "min": 0.36269585595758447,
        "max": 0.36269585595758447,
        "mean": 0.36269585595758447,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36269585595758447,
        "sum_squared": 0.13154828392880485,
        "min": 0.36269585595758447,
        "max": 0.36269585595758447,
        "mean": 0.36269585595758447,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.019230769230769232,
        "sum_squared": 0.00036982248520710064,
        "min": 0.019230769230769232,
        "max": 0.019230769230769232,
        "mean": 0.019230769230769232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.019230769230769232,
        "sum_squared": 0.00036982248520710064,
        "min": 0.019230769230769232,
        "max": 0.019230769230769232,
        "mean": 0.019230769230769232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3454.4545454545455,
        "sum_squared": 11933256.20661157,
        "min": 3454.4545454545455,
        "max": 3454.4545454545455,
        "mean": 3454.4545454545455,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3454.4545454545455,
        "sum_squared": 11933256.20661157,
        "min": 3454.4545454545455,
        "max": 3454.4545454545455,
        "mean": 3454.4545454545455,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.549782081083818,
        "sum_squared": 6.501388661016125,
        "min": 2.549782081083818,
        "max": 2.549782081083818,
        "mean": 2.549782081083818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.549782081083818,
        "sum_squared": 6.501388661016125,
        "min": 2.549782081083818,
        "max": 2.549782081083818,
        "mean": 2.549782081083818,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6666666666666666,
        "sum_squared": 0.4444444444444444,
        "min": 0.6666666666666666,
        "max": 0.6666666666666666,
        "mean": 0.6666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6232323232323232,
        "sum_squared": 0.388418528721559,
        "min": 0.6232323232323232,
        "max": 0.6232323232323232,
        "mean": 0.6232323232323232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6232323232323232,
        "sum_squared": 0.388418528721559,
        "min": 0.6232323232323232,
        "max": 0.6232323232323232,
        "mean": 0.6232323232323232,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3850599854375863,
        "sum_squared": 0.14827119238519418,
        "min": 0.3850599854375863,
        "max": 0.3850599854375863,
        "mean": 0.3850599854375863,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3850599854375863,
        "sum_squared": 0.14827119238519418,
        "min": 0.3850599854375863,
        "max": 0.3850599854375863,
        "mean": 0.3850599854375863,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.07080007118830954,
        "sum_squared": 0.0050126500802696984,
        "min": 0.07080007118830954,
        "max": 0.07080007118830954,
        "mean": 0.07080007118830954,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.07080007118830954,
        "sum_squared": 0.0050126500802696984,
        "min": 0.07080007118830954,
        "max": 0.07080007118830954,
        "mean": 0.07080007118830954,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=meta_llama-2-7b,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=meta_llama-2-7b,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3671.4615384615386,
        "sum_squared": 13479629.828402368,
        "min": 3671.4615384615386,
        "max": 3671.4615384615386,
        "mean": 3671.4615384615386,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.4449573846963735,
        "sum_squared": 2.0879018435885834,
        "min": 1.4449573846963735,
        "max": 1.4449573846963735,
        "mean": 1.4449573846963735,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 4.358974358974359,
        "sum_squared": 19.00065746219592,
        "min": 4.358974358974359,
        "max": 4.358974358974359,
        "mean": 4.358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7104635653040035,
        "sum_squared": 0.504758477624476,
        "min": 0.7104635653040035,
        "max": 0.7104635653040035,
        "mean": 0.7104635653040035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7098909039623417,
        "sum_squared": 0.5039450955284706,
        "min": 0.7098909039623417,
        "max": 0.7098909039623417,
        "mean": 0.7098909039623417,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5624194727683055,
        "sum_squared": 0.31631566334897876,
        "min": 0.5624194727683055,
        "max": 0.5624194727683055,
        "mean": 0.5624194727683055,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.05738985976942292,
        "sum_squared": 0.0032935960043540278,
        "min": 0.05738985976942292,
        "max": 0.05738985976942292,
        "mean": 0.05738985976942292,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 29.743589743589745,
        "sum_squared": 884.681130834977,
        "min": 29.743589743589745,
        "max": 29.743589743589745,
        "mean": 29.743589743589745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3593.3636363636365,
        "sum_squared": 12912262.223140497,
        "min": 3593.3636363636365,
        "max": 3593.3636363636365,
        "mean": 3593.3636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.4895370223305442,
        "sum_squared": 2.218720540893344,
        "min": 1.4895370223305442,
        "max": 1.4895370223305442,
        "mean": 1.4895370223305442,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.2727272727272725,
        "sum_squared": 18.256198347107436,
        "min": 4.2727272727272725,
        "max": 4.2727272727272725,
        "mean": 4.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.2727272727272727,
        "sum_squared": 0.07438016528925619,
        "min": 0.2727272727272727,
        "max": 0.2727272727272727,
        "mean": 0.2727272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5454545454545454,
        "sum_squared": 0.29752066115702475,
        "min": 0.5454545454545454,
        "max": 0.5454545454545454,
        "mean": 0.5454545454545454,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7696969696969697,
        "sum_squared": 0.5924334251606979,
        "min": 0.7696969696969697,
        "max": 0.7696969696969697,
        "mean": 0.7696969696969697,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8405483405483406,
        "sum_squared": 0.7065215127985691,
        "min": 0.8405483405483406,
        "max": 0.8405483405483406,
        "mean": 0.8405483405483406,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5726611281552104,
        "sum_squared": 0.32794076769999836,
        "min": 0.5726611281552104,
        "max": 0.5726611281552104,
        "mean": 0.5726611281552104,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.1496071529837806,
        "sum_squared": 0.022382300223912334,
        "min": 0.1496071529837806,
        "max": 0.1496071529837806,
        "mean": 0.1496071529837806,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 17.818181818181817,
        "sum_squared": 317.48760330578506,
        "min": 17.818181818181817,
        "max": 17.818181818181817,
        "mean": 17.818181818181817,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3671.4615384615386,
        "sum_squared": 13479629.828402368,
        "min": 3671.4615384615386,
        "max": 3671.4615384615386,
        "mean": 3671.4615384615386,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3671.4615384615386,
        "sum_squared": 13479629.828402368,
        "min": 3671.4615384615386,
        "max": 3671.4615384615386,
        "mean": 3671.4615384615386,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4449573846963735,
        "sum_squared": 2.0879018435885834,
        "min": 1.4449573846963735,
        "max": 1.4449573846963735,
        "mean": 1.4449573846963735,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4449573846963735,
        "sum_squared": 2.0879018435885834,
        "min": 1.4449573846963735,
        "max": 1.4449573846963735,
        "mean": 1.4449573846963735,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.358974358974359,
        "sum_squared": 19.00065746219592,
        "min": 4.358974358974359,
        "max": 4.358974358974359,
        "mean": 4.358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.358974358974359,
        "sum_squared": 19.00065746219592,
        "min": 4.358974358974359,
        "max": 4.358974358974359,
        "mean": 4.358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3333333333333333,
        "sum_squared": 0.1111111111111111,
        "min": 0.3333333333333333,
        "max": 0.3333333333333333,
        "mean": 0.3333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48717948717948717,
        "sum_squared": 0.2373438527284681,
        "min": 0.48717948717948717,
        "max": 0.48717948717948717,
        "mean": 0.48717948717948717,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7104635653040035,
        "sum_squared": 0.504758477624476,
        "min": 0.7104635653040035,
        "max": 0.7104635653040035,
        "mean": 0.7104635653040035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7104635653040035,
        "sum_squared": 0.504758477624476,
        "min": 0.7104635653040035,
        "max": 0.7104635653040035,
        "mean": 0.7104635653040035,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7098909039623417,
        "sum_squared": 0.5039450955284706,
        "min": 0.7098909039623417,
        "max": 0.7098909039623417,
        "mean": 0.7098909039623417,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7098909039623417,
        "sum_squared": 0.5039450955284706,
        "min": 0.7098909039623417,
        "max": 0.7098909039623417,
        "mean": 0.7098909039623417,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5624194727683055,
        "sum_squared": 0.31631566334897876,
        "min": 0.5624194727683055,
        "max": 0.5624194727683055,
        "mean": 0.5624194727683055,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5624194727683055,
        "sum_squared": 0.31631566334897876,
        "min": 0.5624194727683055,
        "max": 0.5624194727683055,
        "mean": 0.5624194727683055,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.05738985976942292,
        "sum_squared": 0.0032935960043540278,
        "min": 0.05738985976942292,
        "max": 0.05738985976942292,
        "mean": 0.05738985976942292,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.05738985976942292,
        "sum_squared": 0.0032935960043540278,
        "min": 0.05738985976942292,
        "max": 0.05738985976942292,
        "mean": 0.05738985976942292,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 29.743589743589745,
        "sum_squared": 884.681130834977,
        "min": 29.743589743589745,
        "max": 29.743589743589745,
        "mean": 29.743589743589745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 29.743589743589745,
        "sum_squared": 884.681130834977,
        "min": 29.743589743589745,
        "max": 29.743589743589745,
        "mean": 29.743589743589745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3593.3636363636365,
        "sum_squared": 12912262.223140497,
        "min": 3593.3636363636365,
        "max": 3593.3636363636365,
        "mean": 3593.3636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3593.3636363636365,
        "sum_squared": 12912262.223140497,
        "min": 3593.3636363636365,
        "max": 3593.3636363636365,
        "mean": 3593.3636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4895370223305442,
        "sum_squared": 2.218720540893344,
        "min": 1.4895370223305442,
        "max": 1.4895370223305442,
        "mean": 1.4895370223305442,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.4895370223305442,
        "sum_squared": 2.218720540893344,
        "min": 1.4895370223305442,
        "max": 1.4895370223305442,
        "mean": 1.4895370223305442,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.2727272727272725,
        "sum_squared": 18.256198347107436,
        "min": 4.2727272727272725,
        "max": 4.2727272727272725,
        "mean": 4.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.2727272727272725,
        "sum_squared": 18.256198347107436,
        "min": 4.2727272727272725,
        "max": 4.2727272727272725,
        "mean": 4.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2727272727272727,
        "sum_squared": 0.07438016528925619,
        "min": 0.2727272727272727,
        "max": 0.2727272727272727,
        "mean": 0.2727272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2727272727272727,
        "sum_squared": 0.07438016528925619,
        "min": 0.2727272727272727,
        "max": 0.2727272727272727,
        "mean": 0.2727272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5454545454545454,
        "sum_squared": 0.29752066115702475,
        "min": 0.5454545454545454,
        "max": 0.5454545454545454,
        "mean": 0.5454545454545454,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5454545454545454,
        "sum_squared": 0.29752066115702475,
        "min": 0.5454545454545454,
        "max": 0.5454545454545454,
        "mean": 0.5454545454545454,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7696969696969697,
        "sum_squared": 0.5924334251606979,
        "min": 0.7696969696969697,
        "max": 0.7696969696969697,
        "mean": 0.7696969696969697,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7696969696969697,
        "sum_squared": 0.5924334251606979,
        "min": 0.7696969696969697,
        "max": 0.7696969696969697,
        "mean": 0.7696969696969697,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8405483405483406,
        "sum_squared": 0.7065215127985691,
        "min": 0.8405483405483406,
        "max": 0.8405483405483406,
        "mean": 0.8405483405483406,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8405483405483406,
        "sum_squared": 0.7065215127985691,
        "min": 0.8405483405483406,
        "max": 0.8405483405483406,
        "mean": 0.8405483405483406,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5726611281552104,
        "sum_squared": 0.32794076769999836,
        "min": 0.5726611281552104,
        "max": 0.5726611281552104,
        "mean": 0.5726611281552104,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5726611281552104,
        "sum_squared": 0.32794076769999836,
        "min": 0.5726611281552104,
        "max": 0.5726611281552104,
        "mean": 0.5726611281552104,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1496071529837806,
        "sum_squared": 0.022382300223912334,
        "min": 0.1496071529837806,
        "max": 0.1496071529837806,
        "mean": 0.1496071529837806,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1496071529837806,
        "sum_squared": 0.022382300223912334,
        "min": 0.1496071529837806,
        "max": 0.1496071529837806,
        "mean": 0.1496071529837806,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 17.818181818181817,
        "sum_squared": 317.48760330578506,
        "min": 17.818181818181817,
        "max": 17.818181818181817,
        "mean": 17.818181818181817,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 17.818181818181817,
        "sum_squared": 317.48760330578506,
        "min": 17.818181818181817,
        "max": 17.818181818181817,
        "mean": 17.818181818181817,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=microsoft_phi-2,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=microsoft_phi-2,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1717.6666666666667,
        "sum_squared": 2950378.777777778,
        "min": 1717.6666666666667,
        "max": 1717.6666666666667,
        "mean": 1717.6666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5483537698403383,
        "sum_squared": 0.30069185689811073,
        "min": 0.5483537698403383,
        "max": 0.5483537698403383,
        "mean": 0.5483537698403383,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 2.051282051282051,
        "sum_squared": 4.207758053911899,
        "min": 2.051282051282051,
        "max": 2.051282051282051,
        "mean": 2.051282051282051,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.1794871794871795,
        "sum_squared": 0.032215647600262985,
        "min": 0.1794871794871795,
        "max": 0.1794871794871795,
        "mean": 0.1794871794871795,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.38461538461538464,
        "sum_squared": 0.14792899408284024,
        "min": 0.38461538461538464,
        "max": 0.38461538461538464,
        "mean": 0.38461538461538464,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6404732271248108,
        "sum_squared": 0.4102059546636695,
        "min": 0.6404732271248108,
        "max": 0.6404732271248108,
        "mean": 0.6404732271248108,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6533316635551568,
        "sum_squared": 0.42684226260374863,
        "min": 0.6533316635551568,
        "max": 0.6533316635551568,
        "mean": 0.6533316635551568,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4281173133480012,
        "sum_squared": 0.18328443398831062,
        "min": 0.4281173133480012,
        "max": 0.4281173133480012,
        "mean": 0.4281173133480012,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.03874318458569219,
        "sum_squared": 0.0015010343518410169,
        "min": 0.03874318458569219,
        "max": 0.03874318458569219,
        "mean": 0.03874318458569219,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 25.82051282051282,
        "sum_squared": 666.6988823142669,
        "min": 25.82051282051282,
        "max": 25.82051282051282,
        "mean": 25.82051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1721.909090909091,
        "sum_squared": 2964970.917355372,
        "min": 1721.909090909091,
        "max": 1721.909090909091,
        "mean": 1721.909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5653736157850786,
        "sum_squared": 0.3196473254258937,
        "min": 0.5653736157850786,
        "max": 0.5653736157850786,
        "mean": 0.5653736157850786,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.090909090909091,
        "sum_squared": 4.37190082644628,
        "min": 2.090909090909091,
        "max": 2.090909090909091,
        "mean": 2.090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7575757575757577,
        "sum_squared": 0.5739210284664832,
        "min": 0.7575757575757577,
        "max": 0.7575757575757577,
        "mean": 0.7575757575757577,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7534271284271284,
        "sum_squared": 0.5676524378499487,
        "min": 0.7534271284271284,
        "max": 0.7534271284271284,
        "mean": 0.7534271284271284,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5103042628099558,
        "sum_squared": 0.2604104406420124,
        "min": 0.5103042628099558,
        "max": 0.5103042628099558,
        "mean": 0.5103042628099558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.116254616642855,
        "sum_squared": 0.013515135890777179,
        "min": 0.116254616642855,
        "max": 0.116254616642855,
        "mean": 0.116254616642855,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 26.09090909090909,
        "sum_squared": 680.7355371900826,
        "min": 26.09090909090909,
        "max": 26.09090909090909,
        "mean": 26.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1717.6666666666667,
        "sum_squared": 2950378.777777778,
        "min": 1717.6666666666667,
        "max": 1717.6666666666667,
        "mean": 1717.6666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1717.6666666666667,
        "sum_squared": 2950378.777777778,
        "min": 1717.6666666666667,
        "max": 1717.6666666666667,
        "mean": 1717.6666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5483537698403383,
        "sum_squared": 0.30069185689811073,
        "min": 0.5483537698403383,
        "max": 0.5483537698403383,
        "mean": 0.5483537698403383,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5483537698403383,
        "sum_squared": 0.30069185689811073,
        "min": 0.5483537698403383,
        "max": 0.5483537698403383,
        "mean": 0.5483537698403383,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.051282051282051,
        "sum_squared": 4.207758053911899,
        "min": 2.051282051282051,
        "max": 2.051282051282051,
        "mean": 2.051282051282051,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.051282051282051,
        "sum_squared": 4.207758053911899,
        "min": 2.051282051282051,
        "max": 2.051282051282051,
        "mean": 2.051282051282051,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1794871794871795,
        "sum_squared": 0.032215647600262985,
        "min": 0.1794871794871795,
        "max": 0.1794871794871795,
        "mean": 0.1794871794871795,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1794871794871795,
        "sum_squared": 0.032215647600262985,
        "min": 0.1794871794871795,
        "max": 0.1794871794871795,
        "mean": 0.1794871794871795,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.38461538461538464,
        "sum_squared": 0.14792899408284024,
        "min": 0.38461538461538464,
        "max": 0.38461538461538464,
        "mean": 0.38461538461538464,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.38461538461538464,
        "sum_squared": 0.14792899408284024,
        "min": 0.38461538461538464,
        "max": 0.38461538461538464,
        "mean": 0.38461538461538464,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6404732271248108,
        "sum_squared": 0.4102059546636695,
        "min": 0.6404732271248108,
        "max": 0.6404732271248108,
        "mean": 0.6404732271248108,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6404732271248108,
        "sum_squared": 0.4102059546636695,
        "min": 0.6404732271248108,
        "max": 0.6404732271248108,
        "mean": 0.6404732271248108,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6533316635551568,
        "sum_squared": 0.42684226260374863,
        "min": 0.6533316635551568,
        "max": 0.6533316635551568,
        "mean": 0.6533316635551568,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6533316635551568,
        "sum_squared": 0.42684226260374863,
        "min": 0.6533316635551568,
        "max": 0.6533316635551568,
        "mean": 0.6533316635551568,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4281173133480012,
        "sum_squared": 0.18328443398831062,
        "min": 0.4281173133480012,
        "max": 0.4281173133480012,
        "mean": 0.4281173133480012,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4281173133480012,
        "sum_squared": 0.18328443398831062,
        "min": 0.4281173133480012,
        "max": 0.4281173133480012,
        "mean": 0.4281173133480012,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.03874318458569219,
        "sum_squared": 0.0015010343518410169,
        "min": 0.03874318458569219,
        "max": 0.03874318458569219,
        "mean": 0.03874318458569219,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.03874318458569219,
        "sum_squared": 0.0015010343518410169,
        "min": 0.03874318458569219,
        "max": 0.03874318458569219,
        "mean": 0.03874318458569219,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 25.82051282051282,
        "sum_squared": 666.6988823142669,
        "min": 25.82051282051282,
        "max": 25.82051282051282,
        "mean": 25.82051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 25.82051282051282,
        "sum_squared": 666.6988823142669,
        "min": 25.82051282051282,
        "max": 25.82051282051282,
        "mean": 25.82051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1721.909090909091,
        "sum_squared": 2964970.917355372,
        "min": 1721.909090909091,
        "max": 1721.909090909091,
        "mean": 1721.909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1721.909090909091,
        "sum_squared": 2964970.917355372,
        "min": 1721.909090909091,
        "max": 1721.909090909091,
        "mean": 1721.909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5653736157850786,
        "sum_squared": 0.3196473254258937,
        "min": 0.5653736157850786,
        "max": 0.5653736157850786,
        "mean": 0.5653736157850786,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5653736157850786,
        "sum_squared": 0.3196473254258937,
        "min": 0.5653736157850786,
        "max": 0.5653736157850786,
        "mean": 0.5653736157850786,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.090909090909091,
        "sum_squared": 4.37190082644628,
        "min": 2.090909090909091,
        "max": 2.090909090909091,
        "mean": 2.090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.090909090909091,
        "sum_squared": 4.37190082644628,
        "min": 2.090909090909091,
        "max": 2.090909090909091,
        "mean": 2.090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7575757575757577,
        "sum_squared": 0.5739210284664832,
        "min": 0.7575757575757577,
        "max": 0.7575757575757577,
        "mean": 0.7575757575757577,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7575757575757577,
        "sum_squared": 0.5739210284664832,
        "min": 0.7575757575757577,
        "max": 0.7575757575757577,
        "mean": 0.7575757575757577,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7534271284271284,
        "sum_squared": 0.5676524378499487,
        "min": 0.7534271284271284,
        "max": 0.7534271284271284,
        "mean": 0.7534271284271284,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7534271284271284,
        "sum_squared": 0.5676524378499487,
        "min": 0.7534271284271284,
        "max": 0.7534271284271284,
        "mean": 0.7534271284271284,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5103042628099558,
        "sum_squared": 0.2604104406420124,
        "min": 0.5103042628099558,
        "max": 0.5103042628099558,
        "mean": 0.5103042628099558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5103042628099558,
        "sum_squared": 0.2604104406420124,
        "min": 0.5103042628099558,
        "max": 0.5103042628099558,
        "mean": 0.5103042628099558,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.116254616642855,
        "sum_squared": 0.013515135890777179,
        "min": 0.116254616642855,
        "max": 0.116254616642855,
        "mean": 0.116254616642855,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.116254616642855,
        "sum_squared": 0.013515135890777179,
        "min": 0.116254616642855,
        "max": 0.116254616642855,
        "mean": 0.116254616642855,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 26.09090909090909,
        "sum_squared": 680.7355371900826,
        "min": 26.09090909090909,
        "max": 26.09090909090909,
        "mean": 26.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 26.09090909090909,
        "sum_squared": 680.7355371900826,
        "min": 26.09090909090909,
        "max": 26.09090909090909,
        "mean": 26.09090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3638.5641025641025,
        "sum_squared": 13239148.728468113,
        "min": 3638.5641025641025,
        "max": 3638.5641025641025,
        "mean": 3638.5641025641025,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8982290365757086,
        "sum_squared": 0.8068154021477257,
        "min": 0.8982290365757086,
        "max": 0.8982290365757086,
        "mean": 0.8982290365757086,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 4.538461538461538,
        "sum_squared": 20.59763313609467,
        "min": 4.538461538461538,
        "max": 4.538461538461538,
        "mean": 4.538461538461538,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.28205128205128205,
        "sum_squared": 0.07955292570677186,
        "min": 0.28205128205128205,
        "max": 0.28205128205128205,
        "mean": 0.28205128205128205,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6153846153846154,
        "sum_squared": 0.37869822485207105,
        "min": 0.6153846153846154,
        "max": 0.6153846153846154,
        "mean": 0.6153846153846154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8092498969017188,
        "sum_squared": 0.6548853956354425,
        "min": 0.8092498969017188,
        "max": 0.8092498969017188,
        "mean": 0.8092498969017188,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8023182230874539,
        "sum_squared": 0.6437145310982095,
        "min": 0.8023182230874539,
        "max": 0.8023182230874539,
        "mean": 0.8023182230874539,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5658146421334544,
        "sum_squared": 0.3201462092526091,
        "min": 0.5658146421334544,
        "max": 0.5658146421334544,
        "mean": 0.5658146421334544,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.07506891129185964,
        "sum_squared": 0.005635341442545091,
        "min": 0.07506891129185964,
        "max": 0.07506891129185964,
        "mean": 0.07506891129185964,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 20.794871794871796,
        "sum_squared": 432.42669296515453,
        "min": 20.794871794871796,
        "max": 20.794871794871796,
        "mean": 20.794871794871796,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3667.3636363636365,
        "sum_squared": 13449556.041322315,
        "min": 3667.3636363636365,
        "max": 3667.3636363636365,
        "mean": 3667.3636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.9675865606828169,
        "sum_squared": 0.9362237524140025,
        "min": 0.9675865606828169,
        "max": 0.9675865606828169,
        "mean": 0.9675865606828169,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.636363636363637,
        "sum_squared": 21.495867768595044,
        "min": 4.636363636363637,
        "max": 4.636363636363637,
        "mean": 4.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5454545454545454,
        "sum_squared": 0.29752066115702475,
        "min": 0.5454545454545454,
        "max": 0.5454545454545454,
        "mean": 0.5454545454545454,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.778236914600551,
        "sum_squared": 0.6056526952469853,
        "min": 0.778236914600551,
        "max": 0.778236914600551,
        "mean": 0.778236914600551,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8305250305250305,
        "sum_squared": 0.6897718263286028,
        "min": 0.8305250305250305,
        "max": 0.8305250305250305,
        "mean": 0.8305250305250305,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5475660393344957,
        "sum_squared": 0.29982856743246644,
        "min": 0.5475660393344957,
        "max": 0.5475660393344957,
        "mean": 0.5475660393344957,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.1221019939089528,
        "sum_squared": 0.014908896916541944,
        "min": 0.1221019939089528,
        "max": 0.1221019939089528,
        "mean": 0.1221019939089528,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 18.0,
        "sum_squared": 324.0,
        "min": 18.0,
        "max": 18.0,
        "mean": 18.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3638.5641025641025,
        "sum_squared": 13239148.728468113,
        "min": 3638.5641025641025,
        "max": 3638.5641025641025,
        "mean": 3638.5641025641025,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3638.5641025641025,
        "sum_squared": 13239148.728468113,
        "min": 3638.5641025641025,
        "max": 3638.5641025641025,
        "mean": 3638.5641025641025,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8982290365757086,
        "sum_squared": 0.8068154021477257,
        "min": 0.8982290365757086,
        "max": 0.8982290365757086,
        "mean": 0.8982290365757086,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8982290365757086,
        "sum_squared": 0.8068154021477257,
        "min": 0.8982290365757086,
        "max": 0.8982290365757086,
        "mean": 0.8982290365757086,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.538461538461538,
        "sum_squared": 20.59763313609467,
        "min": 4.538461538461538,
        "max": 4.538461538461538,
        "mean": 4.538461538461538,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.538461538461538,
        "sum_squared": 20.59763313609467,
        "min": 4.538461538461538,
        "max": 4.538461538461538,
        "mean": 4.538461538461538,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28205128205128205,
        "sum_squared": 0.07955292570677186,
        "min": 0.28205128205128205,
        "max": 0.28205128205128205,
        "mean": 0.28205128205128205,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.28205128205128205,
        "sum_squared": 0.07955292570677186,
        "min": 0.28205128205128205,
        "max": 0.28205128205128205,
        "mean": 0.28205128205128205,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6153846153846154,
        "sum_squared": 0.37869822485207105,
        "min": 0.6153846153846154,
        "max": 0.6153846153846154,
        "mean": 0.6153846153846154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6153846153846154,
        "sum_squared": 0.37869822485207105,
        "min": 0.6153846153846154,
        "max": 0.6153846153846154,
        "mean": 0.6153846153846154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8092498969017188,
        "sum_squared": 0.6548853956354425,
        "min": 0.8092498969017188,
        "max": 0.8092498969017188,
        "mean": 0.8092498969017188,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8092498969017188,
        "sum_squared": 0.6548853956354425,
        "min": 0.8092498969017188,
        "max": 0.8092498969017188,
        "mean": 0.8092498969017188,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8023182230874539,
        "sum_squared": 0.6437145310982095,
        "min": 0.8023182230874539,
        "max": 0.8023182230874539,
        "mean": 0.8023182230874539,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8023182230874539,
        "sum_squared": 0.6437145310982095,
        "min": 0.8023182230874539,
        "max": 0.8023182230874539,
        "mean": 0.8023182230874539,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5658146421334544,
        "sum_squared": 0.3201462092526091,
        "min": 0.5658146421334544,
        "max": 0.5658146421334544,
        "mean": 0.5658146421334544,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5658146421334544,
        "sum_squared": 0.3201462092526091,
        "min": 0.5658146421334544,
        "max": 0.5658146421334544,
        "mean": 0.5658146421334544,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.07506891129185964,
        "sum_squared": 0.005635341442545091,
        "min": 0.07506891129185964,
        "max": 0.07506891129185964,
        "mean": 0.07506891129185964,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.07506891129185964,
        "sum_squared": 0.005635341442545091,
        "min": 0.07506891129185964,
        "max": 0.07506891129185964,
        "mean": 0.07506891129185964,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.794871794871796,
        "sum_squared": 432.42669296515453,
        "min": 20.794871794871796,
        "max": 20.794871794871796,
        "mean": 20.794871794871796,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.794871794871796,
        "sum_squared": 432.42669296515453,
        "min": 20.794871794871796,
        "max": 20.794871794871796,
        "mean": 20.794871794871796,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3667.3636363636365,
        "sum_squared": 13449556.041322315,
        "min": 3667.3636363636365,
        "max": 3667.3636363636365,
        "mean": 3667.3636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3667.3636363636365,
        "sum_squared": 13449556.041322315,
        "min": 3667.3636363636365,
        "max": 3667.3636363636365,
        "mean": 3667.3636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9675865606828169,
        "sum_squared": 0.9362237524140025,
        "min": 0.9675865606828169,
        "max": 0.9675865606828169,
        "mean": 0.9675865606828169,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9675865606828169,
        "sum_squared": 0.9362237524140025,
        "min": 0.9675865606828169,
        "max": 0.9675865606828169,
        "mean": 0.9675865606828169,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.636363636363637,
        "sum_squared": 21.495867768595044,
        "min": 4.636363636363637,
        "max": 4.636363636363637,
        "mean": 4.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.636363636363637,
        "sum_squared": 21.495867768595044,
        "min": 4.636363636363637,
        "max": 4.636363636363637,
        "mean": 4.636363636363637,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5454545454545454,
        "sum_squared": 0.29752066115702475,
        "min": 0.5454545454545454,
        "max": 0.5454545454545454,
        "mean": 0.5454545454545454,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5454545454545454,
        "sum_squared": 0.29752066115702475,
        "min": 0.5454545454545454,
        "max": 0.5454545454545454,
        "mean": 0.5454545454545454,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.778236914600551,
        "sum_squared": 0.6056526952469853,
        "min": 0.778236914600551,
        "max": 0.778236914600551,
        "mean": 0.778236914600551,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.778236914600551,
        "sum_squared": 0.6056526952469853,
        "min": 0.778236914600551,
        "max": 0.778236914600551,
        "mean": 0.778236914600551,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8305250305250305,
        "sum_squared": 0.6897718263286028,
        "min": 0.8305250305250305,
        "max": 0.8305250305250305,
        "mean": 0.8305250305250305,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8305250305250305,
        "sum_squared": 0.6897718263286028,
        "min": 0.8305250305250305,
        "max": 0.8305250305250305,
        "mean": 0.8305250305250305,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5475660393344957,
        "sum_squared": 0.29982856743246644,
        "min": 0.5475660393344957,
        "max": 0.5475660393344957,
        "mean": 0.5475660393344957,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5475660393344957,
        "sum_squared": 0.29982856743246644,
        "min": 0.5475660393344957,
        "max": 0.5475660393344957,
        "mean": 0.5475660393344957,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1221019939089528,
        "sum_squared": 0.014908896916541944,
        "min": 0.1221019939089528,
        "max": 0.1221019939089528,
        "mean": 0.1221019939089528,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1221019939089528,
        "sum_squared": 0.014908896916541944,
        "min": 0.1221019939089528,
        "max": 0.1221019939089528,
        "mean": 0.1221019939089528,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 18.0,
        "sum_squared": 324.0,
        "min": 18.0,
        "max": 18.0,
        "mean": 18.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 18.0,
        "sum_squared": 324.0,
        "min": 18.0,
        "max": 18.0,
        "mean": 18.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=openai_gpt-3.5-turbo-0613,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=openai_gpt-3.5-turbo-0613,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3464.5641025641025,
        "sum_squared": 12003204.420775805,
        "min": 3464.5641025641025,
        "max": 3464.5641025641025,
        "mean": 3464.5641025641025,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6167610608614408,
        "sum_squared": 0.38039420619492986,
        "min": 0.6167610608614408,
        "max": 0.6167610608614408,
        "mean": 0.6167610608614408,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 4.948717948717949,
        "sum_squared": 24.489809335963184,
        "min": 4.948717948717949,
        "max": 4.948717948717949,
        "mean": 4.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.23076923076923078,
        "sum_squared": 0.053254437869822494,
        "min": 0.23076923076923078,
        "max": 0.23076923076923078,
        "mean": 0.23076923076923078,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4358974358974359,
        "sum_squared": 0.19000657462195925,
        "min": 0.4358974358974359,
        "max": 0.4358974358974359,
        "mean": 0.4358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7174581326165038,
        "sum_squared": 0.5147461720575608,
        "min": 0.7174581326165038,
        "max": 0.7174581326165038,
        "mean": 0.7174581326165038,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7193741462504141,
        "sum_squared": 0.5174991622935121,
        "min": 0.7193741462504141,
        "max": 0.7193741462504141,
        "mean": 0.7193741462504141,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5221346013461805,
        "sum_squared": 0.27262454192293484,
        "min": 0.5221346013461805,
        "max": 0.5221346013461805,
        "mean": 0.5221346013461805,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.06503944773175543,
        "sum_squared": 0.004230129761251747,
        "min": 0.06503944773175543,
        "max": 0.06503944773175543,
        "mean": 0.06503944773175543,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 27.58974358974359,
        "sum_squared": 761.1939513477976,
        "min": 27.58974358974359,
        "max": 27.58974358974359,
        "mean": 27.58974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3464.090909090909,
        "sum_squared": 11999925.82644628,
        "min": 3464.090909090909,
        "max": 3464.090909090909,
        "mean": 3464.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7053152864629572,
        "sum_squared": 0.4974696533183233,
        "min": 0.7053152864629572,
        "max": 0.7053152864629572,
        "mean": 0.7053152864629572,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6914600550964187,
        "sum_squared": 0.4781170077939424,
        "min": 0.6914600550964187,
        "max": 0.6914600550964187,
        "mean": 0.6914600550964187,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7630924630924631,
        "sum_squared": 0.5823101072285223,
        "min": 0.7630924630924631,
        "max": 0.7630924630924631,
        "mean": 0.7630924630924631,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.47949041914142077,
        "sum_squared": 0.22991106204841535,
        "min": 0.47949041914142077,
        "max": 0.47949041914142077,
        "mean": 0.47949041914142077,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.13882378668947293,
        "sum_squared": 0.019272043750804282,
        "min": 0.13882378668947293,
        "max": 0.13882378668947293,
        "mean": 0.13882378668947293,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 20.454545454545453,
        "sum_squared": 418.3884297520661,
        "min": 20.454545454545453,
        "max": 20.454545454545453,
        "mean": 20.454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3464.5641025641025,
        "sum_squared": 12003204.420775805,
        "min": 3464.5641025641025,
        "max": 3464.5641025641025,
        "mean": 3464.5641025641025,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3464.5641025641025,
        "sum_squared": 12003204.420775805,
        "min": 3464.5641025641025,
        "max": 3464.5641025641025,
        "mean": 3464.5641025641025,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6167610608614408,
        "sum_squared": 0.38039420619492986,
        "min": 0.6167610608614408,
        "max": 0.6167610608614408,
        "mean": 0.6167610608614408,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6167610608614408,
        "sum_squared": 0.38039420619492986,
        "min": 0.6167610608614408,
        "max": 0.6167610608614408,
        "mean": 0.6167610608614408,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.948717948717949,
        "sum_squared": 24.489809335963184,
        "min": 4.948717948717949,
        "max": 4.948717948717949,
        "mean": 4.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.948717948717949,
        "sum_squared": 24.489809335963184,
        "min": 4.948717948717949,
        "max": 4.948717948717949,
        "mean": 4.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.23076923076923078,
        "sum_squared": 0.053254437869822494,
        "min": 0.23076923076923078,
        "max": 0.23076923076923078,
        "mean": 0.23076923076923078,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.23076923076923078,
        "sum_squared": 0.053254437869822494,
        "min": 0.23076923076923078,
        "max": 0.23076923076923078,
        "mean": 0.23076923076923078,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4358974358974359,
        "sum_squared": 0.19000657462195925,
        "min": 0.4358974358974359,
        "max": 0.4358974358974359,
        "mean": 0.4358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4358974358974359,
        "sum_squared": 0.19000657462195925,
        "min": 0.4358974358974359,
        "max": 0.4358974358974359,
        "mean": 0.4358974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7174581326165038,
        "sum_squared": 0.5147461720575608,
        "min": 0.7174581326165038,
        "max": 0.7174581326165038,
        "mean": 0.7174581326165038,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7174581326165038,
        "sum_squared": 0.5147461720575608,
        "min": 0.7174581326165038,
        "max": 0.7174581326165038,
        "mean": 0.7174581326165038,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7193741462504141,
        "sum_squared": 0.5174991622935121,
        "min": 0.7193741462504141,
        "max": 0.7193741462504141,
        "mean": 0.7193741462504141,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7193741462504141,
        "sum_squared": 0.5174991622935121,
        "min": 0.7193741462504141,
        "max": 0.7193741462504141,
        "mean": 0.7193741462504141,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5221346013461805,
        "sum_squared": 0.27262454192293484,
        "min": 0.5221346013461805,
        "max": 0.5221346013461805,
        "mean": 0.5221346013461805,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5221346013461805,
        "sum_squared": 0.27262454192293484,
        "min": 0.5221346013461805,
        "max": 0.5221346013461805,
        "mean": 0.5221346013461805,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06503944773175543,
        "sum_squared": 0.004230129761251747,
        "min": 0.06503944773175543,
        "max": 0.06503944773175543,
        "mean": 0.06503944773175543,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06503944773175543,
        "sum_squared": 0.004230129761251747,
        "min": 0.06503944773175543,
        "max": 0.06503944773175543,
        "mean": 0.06503944773175543,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.256410256410256,
        "sum_squared": 39.14266929651545,
        "min": 6.256410256410256,
        "max": 6.256410256410256,
        "mean": 6.256410256410256,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 27.58974358974359,
        "sum_squared": 761.1939513477976,
        "min": 27.58974358974359,
        "max": 27.58974358974359,
        "mean": 27.58974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 27.58974358974359,
        "sum_squared": 761.1939513477976,
        "min": 27.58974358974359,
        "max": 27.58974358974359,
        "mean": 27.58974358974359,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3464.090909090909,
        "sum_squared": 11999925.82644628,
        "min": 3464.090909090909,
        "max": 3464.090909090909,
        "mean": 3464.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3464.090909090909,
        "sum_squared": 11999925.82644628,
        "min": 3464.090909090909,
        "max": 3464.090909090909,
        "mean": 3464.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7053152864629572,
        "sum_squared": 0.4974696533183233,
        "min": 0.7053152864629572,
        "max": 0.7053152864629572,
        "mean": 0.7053152864629572,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7053152864629572,
        "sum_squared": 0.4974696533183233,
        "min": 0.7053152864629572,
        "max": 0.7053152864629572,
        "mean": 0.7053152864629572,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36363636363636365,
        "sum_squared": 0.1322314049586777,
        "min": 0.36363636363636365,
        "max": 0.36363636363636365,
        "mean": 0.36363636363636365,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6914600550964187,
        "sum_squared": 0.4781170077939424,
        "min": 0.6914600550964187,
        "max": 0.6914600550964187,
        "mean": 0.6914600550964187,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6914600550964187,
        "sum_squared": 0.4781170077939424,
        "min": 0.6914600550964187,
        "max": 0.6914600550964187,
        "mean": 0.6914600550964187,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7630924630924631,
        "sum_squared": 0.5823101072285223,
        "min": 0.7630924630924631,
        "max": 0.7630924630924631,
        "mean": 0.7630924630924631,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7630924630924631,
        "sum_squared": 0.5823101072285223,
        "min": 0.7630924630924631,
        "max": 0.7630924630924631,
        "mean": 0.7630924630924631,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47949041914142077,
        "sum_squared": 0.22991106204841535,
        "min": 0.47949041914142077,
        "max": 0.47949041914142077,
        "mean": 0.47949041914142077,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47949041914142077,
        "sum_squared": 0.22991106204841535,
        "min": 0.47949041914142077,
        "max": 0.47949041914142077,
        "mean": 0.47949041914142077,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.13882378668947293,
        "sum_squared": 0.019272043750804282,
        "min": 0.13882378668947293,
        "max": 0.13882378668947293,
        "mean": 0.13882378668947293,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.13882378668947293,
        "sum_squared": 0.019272043750804282,
        "min": 0.13882378668947293,
        "max": 0.13882378668947293,
        "mean": 0.13882378668947293,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.2727272727272725,
        "sum_squared": 27.801652892561982,
        "min": 5.2727272727272725,
        "max": 5.2727272727272725,
        "mean": 5.2727272727272725,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.454545454545453,
        "sum_squared": 418.3884297520661,
        "min": 20.454545454545453,
        "max": 20.454545454545453,
        "mean": 20.454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.454545454545453,
        "sum_squared": 418.3884297520661,
        "min": 20.454545454545453,
        "max": 20.454545454545453,
        "mean": 20.454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=openai_gpt-4-1106-preview,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=openai_gpt-4-1106-preview,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3492.358974358974,
        "sum_squared": 12196571.205785666,
        "min": 3492.358974358974,
        "max": 3492.358974358974,
        "mean": 3492.358974358974,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1927946286323743,
        "sum_squared": 1.4227590260942438,
        "min": 1.1927946286323743,
        "max": 1.1927946286323743,
        "mean": 1.1927946286323743,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.20512820512820512,
        "sum_squared": 0.042077580539118996,
        "min": 0.20512820512820512,
        "max": 0.20512820512820512,
        "mean": 0.20512820512820512,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5128205128205128,
        "sum_squared": 0.2629848783694937,
        "min": 0.5128205128205128,
        "max": 0.5128205128205128,
        "mean": 0.5128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.746199366787602,
        "sum_squared": 0.5568134949942182,
        "min": 0.746199366787602,
        "max": 0.746199366787602,
        "mean": 0.746199366787602,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7398372385610926,
        "sum_squared": 0.5473591395617031,
        "min": 0.7398372385610926,
        "max": 0.7398372385610926,
        "mean": 0.7398372385610926,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.49287117061307545,
        "sum_squared": 0.24292199082150331,
        "min": 0.49287117061307545,
        "max": 0.49287117061307545,
        "mean": 0.49287117061307545,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.06879445340983803,
        "sum_squared": 0.004732676819958375,
        "min": 0.06879445340983803,
        "max": 0.06879445340983803,
        "mean": 0.06879445340983803,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 24.17948717948718,
        "sum_squared": 584.6476002629848,
        "min": 24.17948717948718,
        "max": 24.17948717948718,
        "mean": 24.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3464.090909090909,
        "sum_squared": 11999925.82644628,
        "min": 3464.090909090909,
        "max": 3464.090909090909,
        "mean": 3464.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.1362367976795544,
        "sum_squared": 1.2910340604010888,
        "min": 1.1362367976795544,
        "max": 1.1362367976795544,
        "mean": 1.1362367976795544,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7424242424242423,
        "sum_squared": 0.5511937557392101,
        "min": 0.7424242424242423,
        "max": 0.7424242424242423,
        "mean": 0.7424242424242423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8072150072150073,
        "sum_squared": 0.6515960678731243,
        "min": 0.8072150072150073,
        "max": 0.8072150072150073,
        "mean": 0.8072150072150073,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.48668538918044474,
        "sum_squared": 0.23686266804172096,
        "min": 0.48668538918044474,
        "max": 0.48668538918044474,
        "mean": 0.48668538918044474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.1253455257337641,
        "sum_squared": 0.015711500821473715,
        "min": 0.1253455257337641,
        "max": 0.1253455257337641,
        "mean": 0.1253455257337641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 16.727272727272727,
        "sum_squared": 279.80165289256195,
        "min": 16.727272727272727,
        "max": 16.727272727272727,
        "mean": 16.727272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3492.358974358974,
        "sum_squared": 12196571.205785666,
        "min": 3492.358974358974,
        "max": 3492.358974358974,
        "mean": 3492.358974358974,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3492.358974358974,
        "sum_squared": 12196571.205785666,
        "min": 3492.358974358974,
        "max": 3492.358974358974,
        "mean": 3492.358974358974,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1927946286323743,
        "sum_squared": 1.4227590260942438,
        "min": 1.1927946286323743,
        "max": 1.1927946286323743,
        "mean": 1.1927946286323743,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1927946286323743,
        "sum_squared": 1.4227590260942438,
        "min": 1.1927946286323743,
        "max": 1.1927946286323743,
        "mean": 1.1927946286323743,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.20512820512820512,
        "sum_squared": 0.042077580539118996,
        "min": 0.20512820512820512,
        "max": 0.20512820512820512,
        "mean": 0.20512820512820512,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.20512820512820512,
        "sum_squared": 0.042077580539118996,
        "min": 0.20512820512820512,
        "max": 0.20512820512820512,
        "mean": 0.20512820512820512,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5128205128205128,
        "sum_squared": 0.2629848783694937,
        "min": 0.5128205128205128,
        "max": 0.5128205128205128,
        "mean": 0.5128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5128205128205128,
        "sum_squared": 0.2629848783694937,
        "min": 0.5128205128205128,
        "max": 0.5128205128205128,
        "mean": 0.5128205128205128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.746199366787602,
        "sum_squared": 0.5568134949942182,
        "min": 0.746199366787602,
        "max": 0.746199366787602,
        "mean": 0.746199366787602,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.746199366787602,
        "sum_squared": 0.5568134949942182,
        "min": 0.746199366787602,
        "max": 0.746199366787602,
        "mean": 0.746199366787602,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7398372385610926,
        "sum_squared": 0.5473591395617031,
        "min": 0.7398372385610926,
        "max": 0.7398372385610926,
        "mean": 0.7398372385610926,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7398372385610926,
        "sum_squared": 0.5473591395617031,
        "min": 0.7398372385610926,
        "max": 0.7398372385610926,
        "mean": 0.7398372385610926,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49287117061307545,
        "sum_squared": 0.24292199082150331,
        "min": 0.49287117061307545,
        "max": 0.49287117061307545,
        "mean": 0.49287117061307545,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49287117061307545,
        "sum_squared": 0.24292199082150331,
        "min": 0.49287117061307545,
        "max": 0.49287117061307545,
        "mean": 0.49287117061307545,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06879445340983803,
        "sum_squared": 0.004732676819958375,
        "min": 0.06879445340983803,
        "max": 0.06879445340983803,
        "mean": 0.06879445340983803,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06879445340983803,
        "sum_squared": 0.004732676819958375,
        "min": 0.06879445340983803,
        "max": 0.06879445340983803,
        "mean": 0.06879445340983803,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.948717948717949,
        "sum_squared": 35.387245233399085,
        "min": 5.948717948717949,
        "max": 5.948717948717949,
        "mean": 5.948717948717949,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 24.17948717948718,
        "sum_squared": 584.6476002629848,
        "min": 24.17948717948718,
        "max": 24.17948717948718,
        "mean": 24.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 24.17948717948718,
        "sum_squared": 584.6476002629848,
        "min": 24.17948717948718,
        "max": 24.17948717948718,
        "mean": 24.17948717948718,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3464.090909090909,
        "sum_squared": 11999925.82644628,
        "min": 3464.090909090909,
        "max": 3464.090909090909,
        "mean": 3464.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3464.090909090909,
        "sum_squared": 11999925.82644628,
        "min": 3464.090909090909,
        "max": 3464.090909090909,
        "mean": 3464.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1362367976795544,
        "sum_squared": 1.2910340604010888,
        "min": 1.1362367976795544,
        "max": 1.1362367976795544,
        "mean": 1.1362367976795544,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1362367976795544,
        "sum_squared": 1.2910340604010888,
        "min": 1.1362367976795544,
        "max": 1.1362367976795544,
        "mean": 1.1362367976795544,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18181818181818182,
        "sum_squared": 0.03305785123966942,
        "min": 0.18181818181818182,
        "max": 0.18181818181818182,
        "mean": 0.18181818181818182,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.45454545454545453,
        "sum_squared": 0.20661157024793386,
        "min": 0.45454545454545453,
        "max": 0.45454545454545453,
        "mean": 0.45454545454545453,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7424242424242423,
        "sum_squared": 0.5511937557392101,
        "min": 0.7424242424242423,
        "max": 0.7424242424242423,
        "mean": 0.7424242424242423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7424242424242423,
        "sum_squared": 0.5511937557392101,
        "min": 0.7424242424242423,
        "max": 0.7424242424242423,
        "mean": 0.7424242424242423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8072150072150073,
        "sum_squared": 0.6515960678731243,
        "min": 0.8072150072150073,
        "max": 0.8072150072150073,
        "mean": 0.8072150072150073,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8072150072150073,
        "sum_squared": 0.6515960678731243,
        "min": 0.8072150072150073,
        "max": 0.8072150072150073,
        "mean": 0.8072150072150073,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48668538918044474,
        "sum_squared": 0.23686266804172096,
        "min": 0.48668538918044474,
        "max": 0.48668538918044474,
        "mean": 0.48668538918044474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48668538918044474,
        "sum_squared": 0.23686266804172096,
        "min": 0.48668538918044474,
        "max": 0.48668538918044474,
        "mean": 0.48668538918044474,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1253455257337641,
        "sum_squared": 0.015711500821473715,
        "min": 0.1253455257337641,
        "max": 0.1253455257337641,
        "mean": 0.1253455257337641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1253455257337641,
        "sum_squared": 0.015711500821473715,
        "min": 0.1253455257337641,
        "max": 0.1253455257337641,
        "mean": 0.1253455257337641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.545454545454546,
        "sum_squared": 20.661157024793393,
        "min": 4.545454545454546,
        "max": 4.545454545454546,
        "mean": 4.545454545454546,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 16.727272727272727,
        "sum_squared": 279.80165289256195,
        "min": 16.727272727272727,
        "max": 16.727272727272727,
        "mean": 16.727272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 16.727272727272727,
        "sum_squared": 279.80165289256195,
        "min": 16.727272727272727,
        "max": 16.727272727272727,
        "mean": 16.727272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/narrative_qa:model=qwen_qwen1.5-7b,additional_instructions=narrative_qa",
    "run_spec": {
      "name": "narrative_qa:model=qwen_qwen1.5-7b,additional_instructions=narrative_qa",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.narrativeqa_scenario.NarrativeQAScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": " Answer only with a single letter corresponding to the correct option.\n",
        "input_prefix": "Passage: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 100,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score",
              "rouge_l",
              "bleu_1",
              "bleu_4"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "narrative_qa"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 3499.3846153846152,
        "sum_squared": 12245692.686390532,
        "min": 3499.3846153846152,
        "max": 3499.3846153846152,
        "mean": 3499.3846153846152,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9033157519805126,
        "sum_squared": 0.815979347776119,
        "min": 0.9033157519805126,
        "max": 0.9033157519805126,
        "mean": 0.9033157519805126,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.1282051282051282,
        "sum_squared": 0.016436554898093356,
        "min": 0.1282051282051282,
        "max": 0.1282051282051282,
        "mean": 0.1282051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test"
        },
        "count": 1,
        "sum": 0.20512820512820512,
        "sum_squared": 0.042077580539118996,
        "min": 0.20512820512820512,
        "max": 0.20512820512820512,
        "mean": 0.20512820512820512,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4246909225167372,
        "sum_squared": 0.18036237966811727,
        "min": 0.4246909225167372,
        "max": 0.4246909225167372,
        "mean": 0.4246909225167372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test"
        },
        "count": 1,
        "sum": 0.41400741133231306,
        "sum_squared": 0.17140213663808307,
        "min": 0.41400741133231306,
        "max": 0.41400741133231306,
        "mean": 0.41400741133231306,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test"
        },
        "count": 1,
        "sum": 0.30990206698691997,
        "sum_squared": 0.09603929112276544,
        "min": 0.30990206698691997,
        "max": 0.30990206698691997,
        "mean": 0.30990206698691997,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test"
        },
        "count": 1,
        "sum": 0.04619031761888905,
        "sum_squared": 0.0021335454417338523,
        "min": 0.04619031761888905,
        "max": 0.04619031761888905,
        "mean": 0.04619031761888905,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 99.1025641025641,
        "sum_squared": 9821.318211702826,
        "min": 99.1025641025641,
        "max": 99.1025641025641,
        "mean": 99.1025641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 3471.090909090909,
        "sum_squared": 12048472.099173553,
        "min": 3471.090909090909,
        "max": 3471.090909090909,
        "mean": 3471.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7938579429279674,
        "sum_squared": 0.630210433549824,
        "min": 0.7938579429279674,
        "max": 0.7938579429279674,
        "mean": 0.7938579429279674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.49192588379191254,
        "sum_squared": 0.24199107514445425,
        "min": 0.49192588379191254,
        "max": 0.49192588379191254,
        "mean": 0.49192588379191254,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5211235733963007,
        "sum_squared": 0.2715697787493296,
        "min": 0.5211235733963007,
        "max": 0.5211235733963007,
        "mean": 0.5211235733963007,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.37736085345876097,
        "sum_squared": 0.14240121372312448,
        "min": 0.37736085345876097,
        "max": 0.37736085345876097,
        "mean": 0.37736085345876097,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.03253588516746411,
        "sum_squared": 0.0010585838236304111,
        "min": 0.03253588516746411,
        "max": 0.03253588516746411,
        "mean": 0.03253588516746411,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 57.27272727272727,
        "sum_squared": 3280.1652892561983,
        "min": 57.27272727272727,
        "max": 57.27272727272727,
        "mean": 57.27272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3499.3846153846152,
        "sum_squared": 12245692.686390532,
        "min": 3499.3846153846152,
        "max": 3499.3846153846152,
        "mean": 3499.3846153846152,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3499.3846153846152,
        "sum_squared": 12245692.686390532,
        "min": 3499.3846153846152,
        "max": 3499.3846153846152,
        "mean": 3499.3846153846152,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9033157519805126,
        "sum_squared": 0.815979347776119,
        "min": 0.9033157519805126,
        "max": 0.9033157519805126,
        "mean": 0.9033157519805126,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9033157519805126,
        "sum_squared": 0.815979347776119,
        "min": 0.9033157519805126,
        "max": 0.9033157519805126,
        "mean": 0.9033157519805126,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1282051282051282,
        "sum_squared": 0.016436554898093356,
        "min": 0.1282051282051282,
        "max": 0.1282051282051282,
        "mean": 0.1282051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.1282051282051282,
        "sum_squared": 0.016436554898093356,
        "min": 0.1282051282051282,
        "max": 0.1282051282051282,
        "mean": 0.1282051282051282,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.20512820512820512,
        "sum_squared": 0.042077580539118996,
        "min": 0.20512820512820512,
        "max": 0.20512820512820512,
        "mean": 0.20512820512820512,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.20512820512820512,
        "sum_squared": 0.042077580539118996,
        "min": 0.20512820512820512,
        "max": 0.20512820512820512,
        "mean": 0.20512820512820512,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4246909225167372,
        "sum_squared": 0.18036237966811727,
        "min": 0.4246909225167372,
        "max": 0.4246909225167372,
        "mean": 0.4246909225167372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4246909225167372,
        "sum_squared": 0.18036237966811727,
        "min": 0.4246909225167372,
        "max": 0.4246909225167372,
        "mean": 0.4246909225167372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41400741133231306,
        "sum_squared": 0.17140213663808307,
        "min": 0.41400741133231306,
        "max": 0.41400741133231306,
        "mean": 0.41400741133231306,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.41400741133231306,
        "sum_squared": 0.17140213663808307,
        "min": 0.41400741133231306,
        "max": 0.41400741133231306,
        "mean": 0.41400741133231306,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.30990206698691997,
        "sum_squared": 0.09603929112276544,
        "min": 0.30990206698691997,
        "max": 0.30990206698691997,
        "mean": 0.30990206698691997,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.30990206698691997,
        "sum_squared": 0.09603929112276544,
        "min": 0.30990206698691997,
        "max": 0.30990206698691997,
        "mean": 0.30990206698691997,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04619031761888905,
        "sum_squared": 0.0021335454417338523,
        "min": 0.04619031761888905,
        "max": 0.04619031761888905,
        "mean": 0.04619031761888905,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04619031761888905,
        "sum_squared": 0.0021335454417338523,
        "min": 0.04619031761888905,
        "max": 0.04619031761888905,
        "mean": 0.04619031761888905,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 99.1025641025641,
        "sum_squared": 9821.318211702826,
        "min": 99.1025641025641,
        "max": 99.1025641025641,
        "mean": 99.1025641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 99.1025641025641,
        "sum_squared": 9821.318211702826,
        "min": 99.1025641025641,
        "max": 99.1025641025641,
        "mean": 99.1025641025641,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0,
        "sum_squared": 4.0,
        "min": 2.0,
        "max": 2.0,
        "mean": 2.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3471.090909090909,
        "sum_squared": 12048472.099173553,
        "min": 3471.090909090909,
        "max": 3471.090909090909,
        "mean": 3471.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3471.090909090909,
        "sum_squared": 12048472.099173553,
        "min": 3471.090909090909,
        "max": 3471.090909090909,
        "mean": 3471.090909090909,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7938579429279674,
        "sum_squared": 0.630210433549824,
        "min": 0.7938579429279674,
        "max": 0.7938579429279674,
        "mean": 0.7938579429279674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7938579429279674,
        "sum_squared": 0.630210433549824,
        "min": 0.7938579429279674,
        "max": 0.7938579429279674,
        "mean": 0.7938579429279674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09090909090909091,
        "sum_squared": 0.008264462809917356,
        "min": 0.09090909090909091,
        "max": 0.09090909090909091,
        "mean": 0.09090909090909091,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49192588379191254,
        "sum_squared": 0.24199107514445425,
        "min": 0.49192588379191254,
        "max": 0.49192588379191254,
        "mean": 0.49192588379191254,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49192588379191254,
        "sum_squared": 0.24199107514445425,
        "min": 0.49192588379191254,
        "max": 0.49192588379191254,
        "mean": 0.49192588379191254,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5211235733963007,
        "sum_squared": 0.2715697787493296,
        "min": 0.5211235733963007,
        "max": 0.5211235733963007,
        "mean": 0.5211235733963007,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "rouge_l",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5211235733963007,
        "sum_squared": 0.2715697787493296,
        "min": 0.5211235733963007,
        "max": 0.5211235733963007,
        "mean": 0.5211235733963007,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37736085345876097,
        "sum_squared": 0.14240121372312448,
        "min": 0.37736085345876097,
        "max": 0.37736085345876097,
        "mean": 0.37736085345876097,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_1",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37736085345876097,
        "sum_squared": 0.14240121372312448,
        "min": 0.37736085345876097,
        "max": 0.37736085345876097,
        "mean": 0.37736085345876097,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.03253588516746411,
        "sum_squared": 0.0010585838236304111,
        "min": 0.03253588516746411,
        "max": 0.03253588516746411,
        "mean": 0.03253588516746411,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bleu_4",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.03253588516746411,
        "sum_squared": 0.0010585838236304111,
        "min": 0.03253588516746411,
        "max": 0.03253588516746411,
        "mean": 0.03253588516746411,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 57.27272727272727,
        "sum_squared": 3280.1652892561983,
        "min": 57.27272727272727,
        "max": 57.27272727272727,
        "mean": 57.27272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 57.27272727272727,
        "sum_squared": 3280.1652892561983,
        "min": 57.27272727272727,
        "max": 57.27272727272727,
        "mean": 57.27272727272727,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 39.0,
        "sum_squared": 1521.0,
        "min": 39.0,
        "max": 39.0,
        "mean": 39.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.0,
        "sum_squared": 121.0,
        "min": 11.0,
        "max": 11.0,
        "mean": 11.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=01-ai_yi-6b,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=01-ai_yi-6b,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 155.4,
        "sum_squared": 24149.160000000003,
        "min": 155.4,
        "max": 155.4,
        "mean": 155.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.36411533832550047,
        "sum_squared": 0.13257997960389367,
        "min": 0.36411533832550047,
        "max": 0.36411533832550047,
        "mean": 0.36411533832550047,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3010266955266955,
        "sum_squared": 0.09061707141972182,
        "min": 0.3010266955266955,
        "max": 0.3010266955266955,
        "mean": 0.3010266955266955,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 12.28,
        "sum_squared": 150.7984,
        "min": 12.28,
        "max": 12.28,
        "mean": 12.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.4,
        "sum_squared": 24149.160000000003,
        "min": 155.4,
        "max": 155.4,
        "mean": 155.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.4,
        "sum_squared": 24149.160000000003,
        "min": 155.4,
        "max": 155.4,
        "mean": 155.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36411533832550047,
        "sum_squared": 0.13257997960389367,
        "min": 0.36411533832550047,
        "max": 0.36411533832550047,
        "mean": 0.36411533832550047,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36411533832550047,
        "sum_squared": 0.13257997960389367,
        "min": 0.36411533832550047,
        "max": 0.36411533832550047,
        "mean": 0.36411533832550047,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2,
        "sum_squared": 0.04000000000000001,
        "min": 0.2,
        "max": 0.2,
        "mean": 0.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3010266955266955,
        "sum_squared": 0.09061707141972182,
        "min": 0.3010266955266955,
        "max": 0.3010266955266955,
        "mean": 0.3010266955266955,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3010266955266955,
        "sum_squared": 0.09061707141972182,
        "min": 0.3010266955266955,
        "max": 0.3010266955266955,
        "mean": 0.3010266955266955,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 12.28,
        "sum_squared": 150.7984,
        "min": 12.28,
        "max": 12.28,
        "mean": 12.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 12.28,
        "sum_squared": 150.7984,
        "min": 12.28,
        "max": 12.28,
        "mean": 12.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=anthropic_claude-2.1,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=anthropic_claude-2.1,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-2.1",
        "model": "anthropic/claude-2.1",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 147.04,
        "sum_squared": 21620.761599999998,
        "min": 147.04,
        "max": 147.04,
        "mean": 147.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.001677985191345,
        "sum_squared": 16.013426697165066,
        "min": 4.001677985191345,
        "max": 4.001677985191345,
        "mean": 4.001677985191345,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.03844863496227748,
        "sum_squared": 0.0014782975304624664,
        "min": 0.03844863496227748,
        "max": 0.03844863496227748,
        "mean": 0.03844863496227748,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 413.26,
        "sum_squared": 170783.8276,
        "min": 413.26,
        "max": 413.26,
        "mean": 413.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 147.04,
        "sum_squared": 21620.761599999998,
        "min": 147.04,
        "max": 147.04,
        "mean": 147.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 147.04,
        "sum_squared": 21620.761599999998,
        "min": 147.04,
        "max": 147.04,
        "mean": 147.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.001677985191345,
        "sum_squared": 16.013426697165066,
        "min": 4.001677985191345,
        "max": 4.001677985191345,
        "mean": 4.001677985191345,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.001677985191345,
        "sum_squared": 16.013426697165066,
        "min": 4.001677985191345,
        "max": 4.001677985191345,
        "mean": 4.001677985191345,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.03844863496227748,
        "sum_squared": 0.0014782975304624664,
        "min": 0.03844863496227748,
        "max": 0.03844863496227748,
        "mean": 0.03844863496227748,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.03844863496227748,
        "sum_squared": 0.0014782975304624664,
        "min": 0.03844863496227748,
        "max": 0.03844863496227748,
        "mean": 0.03844863496227748,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 80.72,
        "sum_squared": 6515.7184,
        "min": 80.72,
        "max": 80.72,
        "mean": 80.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 413.26,
        "sum_squared": 170783.8276,
        "min": 413.26,
        "max": 413.26,
        "mean": 413.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 413.26,
        "sum_squared": 170783.8276,
        "min": 413.26,
        "max": 413.26,
        "mean": 413.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=anthropic_claude-instant-1.2,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "\n\nHuman:",
        "global_suffix": "\n\nAssistant:",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-instant-1.2",
        "model": "anthropic/claude-instant-1.2",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 147.04,
        "sum_squared": 21620.761599999998,
        "min": 147.04,
        "max": 147.04,
        "mean": 147.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.9007561206817627,
        "sum_squared": 0.8113615889456582,
        "min": 0.9007561206817627,
        "max": 0.9007561206817627,
        "mean": 0.9007561206817627,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.2688773448773449,
        "sum_squared": 0.07229502658829066,
        "min": 0.2688773448773449,
        "max": 0.2688773448773449,
        "mean": 0.2688773448773449,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 23.02,
        "sum_squared": 529.9204,
        "min": 23.02,
        "max": 23.02,
        "mean": 23.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 147.04,
        "sum_squared": 21620.761599999998,
        "min": 147.04,
        "max": 147.04,
        "mean": 147.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 147.04,
        "sum_squared": 21620.761599999998,
        "min": 147.04,
        "max": 147.04,
        "mean": 147.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9007561206817627,
        "sum_squared": 0.8113615889456582,
        "min": 0.9007561206817627,
        "max": 0.9007561206817627,
        "mean": 0.9007561206817627,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9007561206817627,
        "sum_squared": 0.8113615889456582,
        "min": 0.9007561206817627,
        "max": 0.9007561206817627,
        "mean": 0.9007561206817627,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2688773448773449,
        "sum_squared": 0.07229502658829066,
        "min": 0.2688773448773449,
        "max": 0.2688773448773449,
        "mean": 0.2688773448773449,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2688773448773449,
        "sum_squared": 0.07229502658829066,
        "min": 0.2688773448773449,
        "max": 0.2688773448773449,
        "mean": 0.2688773448773449,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.38,
        "sum_squared": 28.944399999999998,
        "min": 5.38,
        "max": 5.38,
        "mean": 5.38,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 23.02,
        "sum_squared": 529.9204,
        "min": 23.02,
        "max": 23.02,
        "mean": 23.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 23.02,
        "sum_squared": 529.9204,
        "min": 23.02,
        "max": 23.02,
        "mean": 23.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=google_gemma-7b,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=google_gemma-7b,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 148.74,
        "sum_squared": 22123.587600000003,
        "min": 148.74,
        "max": 148.74,
        "mean": 148.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.32443345069885254,
        "sum_squared": 0.10525706393236478,
        "min": 0.32443345069885254,
        "max": 0.32443345069885254,
        "mean": 0.32443345069885254,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.08,
        "sum_squared": 0.0064,
        "min": 0.08,
        "max": 0.08,
        "mean": 0.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.17862049062049062,
        "sum_squared": 0.03190527966950478,
        "min": 0.17862049062049062,
        "max": 0.17862049062049062,
        "mean": 0.17862049062049062,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 13.82,
        "sum_squared": 190.9924,
        "min": 13.82,
        "max": 13.82,
        "mean": 13.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 148.74,
        "sum_squared": 22123.587600000003,
        "min": 148.74,
        "max": 148.74,
        "mean": 148.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 148.74,
        "sum_squared": 22123.587600000003,
        "min": 148.74,
        "max": 148.74,
        "mean": 148.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32443345069885254,
        "sum_squared": 0.10525706393236478,
        "min": 0.32443345069885254,
        "max": 0.32443345069885254,
        "mean": 0.32443345069885254,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.32443345069885254,
        "sum_squared": 0.10525706393236478,
        "min": 0.32443345069885254,
        "max": 0.32443345069885254,
        "mean": 0.32443345069885254,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.08,
        "sum_squared": 0.0064,
        "min": 0.08,
        "max": 0.08,
        "mean": 0.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.08,
        "sum_squared": 0.0064,
        "min": 0.08,
        "max": 0.08,
        "mean": 0.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17862049062049062,
        "sum_squared": 0.03190527966950478,
        "min": 0.17862049062049062,
        "max": 0.17862049062049062,
        "mean": 0.17862049062049062,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.17862049062049062,
        "sum_squared": 0.03190527966950478,
        "min": 0.17862049062049062,
        "max": 0.17862049062049062,
        "mean": 0.17862049062049062,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 13.82,
        "sum_squared": 190.9924,
        "min": 13.82,
        "max": 13.82,
        "mean": 13.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 13.82,
        "sum_squared": 190.9924,
        "min": 13.82,
        "max": 13.82,
        "mean": 13.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=google_gemma-7b-it,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=google_gemma-7b-it,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 148.74,
        "sum_squared": 22123.587600000003,
        "min": 148.74,
        "max": 148.74,
        "mean": 148.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.31996378898620603,
        "sum_squared": 0.10237682626240938,
        "min": 0.31996378898620603,
        "max": 0.31996378898620603,
        "mean": 0.31996378898620603,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.16,
        "sum_squared": 26.625600000000002,
        "min": 5.16,
        "max": 5.16,
        "mean": 5.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 148.74,
        "sum_squared": 22123.587600000003,
        "min": 148.74,
        "max": 148.74,
        "mean": 148.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 148.74,
        "sum_squared": 22123.587600000003,
        "min": 148.74,
        "max": 148.74,
        "mean": 148.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31996378898620603,
        "sum_squared": 0.10237682626240938,
        "min": 0.31996378898620603,
        "max": 0.31996378898620603,
        "mean": 0.31996378898620603,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.31996378898620603,
        "sum_squared": 0.10237682626240938,
        "min": 0.31996378898620603,
        "max": 0.31996378898620603,
        "mean": 0.31996378898620603,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.16,
        "sum_squared": 26.625600000000002,
        "min": 5.16,
        "max": 5.16,
        "mean": 5.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.16,
        "sum_squared": 26.625600000000002,
        "min": 5.16,
        "max": 5.16,
        "mean": 5.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=google_text-bison@001,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=google_text-bison@001,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 155.96,
        "sum_squared": 24323.521600000004,
        "min": 155.96,
        "max": 155.96,
        "mean": 155.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6457538318634033,
        "sum_squared": 0.4169980113662685,
        "min": 0.6457538318634033,
        "max": 0.6457538318634033,
        "mean": 0.6457538318634033,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.22,
        "sum_squared": 0.0484,
        "min": 0.22,
        "max": 0.22,
        "mean": 0.22,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.22,
        "sum_squared": 0.0484,
        "min": 0.22,
        "max": 0.22,
        "mean": 0.22,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.2854285714285714,
        "sum_squared": 0.0814694693877551,
        "min": 0.2854285714285714,
        "max": 0.2854285714285714,
        "mean": 0.2854285714285714,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.96,
        "sum_squared": 24323.521600000004,
        "min": 155.96,
        "max": 155.96,
        "mean": 155.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.96,
        "sum_squared": 24323.521600000004,
        "min": 155.96,
        "max": 155.96,
        "mean": 155.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6457538318634033,
        "sum_squared": 0.4169980113662685,
        "min": 0.6457538318634033,
        "max": 0.6457538318634033,
        "mean": 0.6457538318634033,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6457538318634033,
        "sum_squared": 0.4169980113662685,
        "min": 0.6457538318634033,
        "max": 0.6457538318634033,
        "mean": 0.6457538318634033,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.22,
        "sum_squared": 0.0484,
        "min": 0.22,
        "max": 0.22,
        "mean": 0.22,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.22,
        "sum_squared": 0.0484,
        "min": 0.22,
        "max": 0.22,
        "mean": 0.22,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.22,
        "sum_squared": 0.0484,
        "min": 0.22,
        "max": 0.22,
        "mean": 0.22,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.22,
        "sum_squared": 0.0484,
        "min": 0.22,
        "max": 0.22,
        "mean": 0.22,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2854285714285714,
        "sum_squared": 0.0814694693877551,
        "min": 0.2854285714285714,
        "max": 0.2854285714285714,
        "mean": 0.2854285714285714,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2854285714285714,
        "sum_squared": 0.0814694693877551,
        "min": 0.2854285714285714,
        "max": 0.2854285714285714,
        "mean": 0.2854285714285714,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=google_text-unicorn@001,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=google_text-unicorn@001,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 155.96,
        "sum_squared": 24323.521600000004,
        "min": 155.96,
        "max": 155.96,
        "mean": 155.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.2489180326461793,
        "sum_squared": 1.559796252268803,
        "min": 1.2489180326461793,
        "max": 1.2489180326461793,
        "mean": 1.2489180326461793,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.4975246583481879,
        "sum_squared": 0.24753078566448108,
        "min": 0.4975246583481879,
        "max": 0.4975246583481879,
        "mean": 0.4975246583481879,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.96,
        "sum_squared": 24323.521600000004,
        "min": 155.96,
        "max": 155.96,
        "mean": 155.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.96,
        "sum_squared": 24323.521600000004,
        "min": 155.96,
        "max": 155.96,
        "mean": 155.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2489180326461793,
        "sum_squared": 1.559796252268803,
        "min": 1.2489180326461793,
        "max": 1.2489180326461793,
        "mean": 1.2489180326461793,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2489180326461793,
        "sum_squared": 1.559796252268803,
        "min": 1.2489180326461793,
        "max": 1.2489180326461793,
        "mean": 1.2489180326461793,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4975246583481879,
        "sum_squared": 0.24753078566448108,
        "min": 0.4975246583481879,
        "max": 0.4975246583481879,
        "mean": 0.4975246583481879,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4975246583481879,
        "sum_squared": 0.24753078566448108,
        "min": 0.4975246583481879,
        "max": 0.4975246583481879,
        "mean": 0.4975246583481879,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=meta_llama-2-7b,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=meta_llama-2-7b,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 161.0,
        "sum_squared": 25921.0,
        "min": 161.0,
        "max": 161.0,
        "mean": 161.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3591157531738281,
        "sum_squared": 0.12896412417760583,
        "min": 0.3591157531738281,
        "max": 0.3591157531738281,
        "mean": 0.3591157531738281,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.18,
        "sum_squared": 0.0324,
        "min": 0.18,
        "max": 0.18,
        "mean": 0.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.29117107385218843,
        "sum_squared": 0.08478059424823657,
        "min": 0.29117107385218843,
        "max": 0.29117107385218843,
        "mean": 0.29117107385218843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 12.64,
        "sum_squared": 159.76960000000003,
        "min": 12.64,
        "max": 12.64,
        "mean": 12.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 161.0,
        "sum_squared": 25921.0,
        "min": 161.0,
        "max": 161.0,
        "mean": 161.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 161.0,
        "sum_squared": 25921.0,
        "min": 161.0,
        "max": 161.0,
        "mean": 161.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3591157531738281,
        "sum_squared": 0.12896412417760583,
        "min": 0.3591157531738281,
        "max": 0.3591157531738281,
        "mean": 0.3591157531738281,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3591157531738281,
        "sum_squared": 0.12896412417760583,
        "min": 0.3591157531738281,
        "max": 0.3591157531738281,
        "mean": 0.3591157531738281,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18,
        "sum_squared": 0.0324,
        "min": 0.18,
        "max": 0.18,
        "mean": 0.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18,
        "sum_squared": 0.0324,
        "min": 0.18,
        "max": 0.18,
        "mean": 0.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29117107385218843,
        "sum_squared": 0.08478059424823657,
        "min": 0.29117107385218843,
        "max": 0.29117107385218843,
        "mean": 0.29117107385218843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29117107385218843,
        "sum_squared": 0.08478059424823657,
        "min": 0.29117107385218843,
        "max": 0.29117107385218843,
        "mean": 0.29117107385218843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 12.64,
        "sum_squared": 159.76960000000003,
        "min": 12.64,
        "max": 12.64,
        "mean": 12.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 12.64,
        "sum_squared": 159.76960000000003,
        "min": 12.64,
        "max": 12.64,
        "mean": 12.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=microsoft_phi-2,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=microsoft_phi-2,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 139.98,
        "sum_squared": 19594.4004,
        "min": 139.98,
        "max": 139.98,
        "mean": 139.98,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.2961044549942017,
        "sum_squared": 0.0876778482674132,
        "min": 0.2961044549942017,
        "max": 0.2961044549942017,
        "mean": 0.2961044549942017,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.09383333333333335,
        "sum_squared": 0.008804694444444448,
        "min": 0.09383333333333335,
        "max": 0.09383333333333335,
        "mean": 0.09383333333333335,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 18.66,
        "sum_squared": 348.1956,
        "min": 18.66,
        "max": 18.66,
        "mean": 18.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 139.98,
        "sum_squared": 19594.4004,
        "min": 139.98,
        "max": 139.98,
        "mean": 139.98,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 139.98,
        "sum_squared": 19594.4004,
        "min": 139.98,
        "max": 139.98,
        "mean": 139.98,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2961044549942017,
        "sum_squared": 0.0876778482674132,
        "min": 0.2961044549942017,
        "max": 0.2961044549942017,
        "mean": 0.2961044549942017,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.2961044549942017,
        "sum_squared": 0.0876778482674132,
        "min": 0.2961044549942017,
        "max": 0.2961044549942017,
        "mean": 0.2961044549942017,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09383333333333335,
        "sum_squared": 0.008804694444444448,
        "min": 0.09383333333333335,
        "max": 0.09383333333333335,
        "mean": 0.09383333333333335,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09383333333333335,
        "sum_squared": 0.008804694444444448,
        "min": 0.09383333333333335,
        "max": 0.09383333333333335,
        "mean": 0.09383333333333335,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 18.66,
        "sum_squared": 348.1956,
        "min": 18.66,
        "max": 18.66,
        "mean": 18.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 18.66,
        "sum_squared": 348.1956,
        "min": 18.66,
        "max": 18.66,
        "mean": 18.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 165.78,
        "sum_squared": 27483.0084,
        "min": 165.78,
        "max": 165.78,
        "mean": 165.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.4726958465576172,
        "sum_squared": 0.2234413633528224,
        "min": 0.4726958465576172,
        "max": 0.4726958465576172,
        "mean": 0.4726958465576172,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.4934239094239094,
        "sum_squared": 0.24346715439117436,
        "min": 0.4934239094239094,
        "max": 0.4934239094239094,
        "mean": 0.4934239094239094,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 20.84,
        "sum_squared": 434.30559999999997,
        "min": 20.84,
        "max": 20.84,
        "mean": 20.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.78,
        "sum_squared": 27483.0084,
        "min": 165.78,
        "max": 165.78,
        "mean": 165.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.78,
        "sum_squared": 27483.0084,
        "min": 165.78,
        "max": 165.78,
        "mean": 165.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4726958465576172,
        "sum_squared": 0.2234413633528224,
        "min": 0.4726958465576172,
        "max": 0.4726958465576172,
        "mean": 0.4726958465576172,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4726958465576172,
        "sum_squared": 0.2234413633528224,
        "min": 0.4726958465576172,
        "max": 0.4726958465576172,
        "mean": 0.4726958465576172,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.26,
        "sum_squared": 0.06760000000000001,
        "min": 0.26,
        "max": 0.26,
        "mean": 0.26,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4934239094239094,
        "sum_squared": 0.24346715439117436,
        "min": 0.4934239094239094,
        "max": 0.4934239094239094,
        "mean": 0.4934239094239094,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4934239094239094,
        "sum_squared": 0.24346715439117436,
        "min": 0.4934239094239094,
        "max": 0.4934239094239094,
        "mean": 0.4934239094239094,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.84,
        "sum_squared": 434.30559999999997,
        "min": 20.84,
        "max": 20.84,
        "mean": 20.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 20.84,
        "sum_squared": 434.30559999999997,
        "min": 20.84,
        "max": 20.84,
        "mean": 20.84,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=openai_gpt-3.5-turbo-0613,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 133.92,
        "sum_squared": 17934.566399999996,
        "min": 133.92,
        "max": 133.92,
        "mean": 133.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.42722376346588137,
        "sum_squared": 0.18252014406995135,
        "min": 0.42722376346588137,
        "max": 0.42722376346588137,
        "mean": 0.42722376346588137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.18,
        "sum_squared": 0.0324,
        "min": 0.18,
        "max": 0.18,
        "mean": 0.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3280874914559125,
        "sum_squared": 0.10764140204983344,
        "min": 0.3280874914559125,
        "max": 0.3280874914559125,
        "mean": 0.3280874914559125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 25.04,
        "sum_squared": 627.0015999999999,
        "min": 25.04,
        "max": 25.04,
        "mean": 25.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 133.92,
        "sum_squared": 17934.566399999996,
        "min": 133.92,
        "max": 133.92,
        "mean": 133.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 133.92,
        "sum_squared": 17934.566399999996,
        "min": 133.92,
        "max": 133.92,
        "mean": 133.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42722376346588137,
        "sum_squared": 0.18252014406995135,
        "min": 0.42722376346588137,
        "max": 0.42722376346588137,
        "mean": 0.42722376346588137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42722376346588137,
        "sum_squared": 0.18252014406995135,
        "min": 0.42722376346588137,
        "max": 0.42722376346588137,
        "mean": 0.42722376346588137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16,
        "sum_squared": 0.0256,
        "min": 0.16,
        "max": 0.16,
        "mean": 0.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18,
        "sum_squared": 0.0324,
        "min": 0.18,
        "max": 0.18,
        "mean": 0.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.18,
        "sum_squared": 0.0324,
        "min": 0.18,
        "max": 0.18,
        "mean": 0.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3280874914559125,
        "sum_squared": 0.10764140204983344,
        "min": 0.3280874914559125,
        "max": 0.3280874914559125,
        "mean": 0.3280874914559125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3280874914559125,
        "sum_squared": 0.10764140204983344,
        "min": 0.3280874914559125,
        "max": 0.3280874914559125,
        "mean": 0.3280874914559125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.58,
        "sum_squared": 31.136400000000002,
        "min": 5.58,
        "max": 5.58,
        "mean": 5.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 25.04,
        "sum_squared": 627.0015999999999,
        "min": 25.04,
        "max": 25.04,
        "mean": 25.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 25.04,
        "sum_squared": 627.0015999999999,
        "min": 25.04,
        "max": 25.04,
        "mean": 25.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=openai_gpt-4-1106-preview,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 133.92,
        "sum_squared": 17934.566399999996,
        "min": 133.92,
        "max": 133.92,
        "mean": 133.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0325166082382202,
        "sum_squared": 1.0660905462877583,
        "min": 1.0325166082382202,
        "max": 1.0325166082382202,
        "mean": 1.0325166082382202,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.49767001553843654,
        "sum_squared": 0.24767544436602767,
        "min": 0.49767001553843654,
        "max": 0.49767001553843654,
        "mean": 0.49767001553843654,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 51.3,
        "sum_squared": 2631.6899999999996,
        "min": 51.3,
        "max": 51.3,
        "mean": 51.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 133.92,
        "sum_squared": 17934.566399999996,
        "min": 133.92,
        "max": 133.92,
        "mean": 133.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 133.92,
        "sum_squared": 17934.566399999996,
        "min": 133.92,
        "max": 133.92,
        "mean": 133.92,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0325166082382202,
        "sum_squared": 1.0660905462877583,
        "min": 1.0325166082382202,
        "max": 1.0325166082382202,
        "mean": 1.0325166082382202,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0325166082382202,
        "sum_squared": 1.0660905462877583,
        "min": 1.0325166082382202,
        "max": 1.0325166082382202,
        "mean": 1.0325166082382202,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.3,
        "sum_squared": 0.09,
        "min": 0.3,
        "max": 0.3,
        "mean": 0.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49767001553843654,
        "sum_squared": 0.24767544436602767,
        "min": 0.49767001553843654,
        "max": 0.49767001553843654,
        "mean": 0.49767001553843654,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.49767001553843654,
        "sum_squared": 0.24767544436602767,
        "min": 0.49767001553843654,
        "max": 0.49767001553843654,
        "mean": 0.49767001553843654,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 11.28,
        "sum_squared": 127.23839999999998,
        "min": 11.28,
        "max": 11.28,
        "mean": 11.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.3,
        "sum_squared": 2631.6899999999996,
        "min": 51.3,
        "max": 51.3,
        "mean": 51.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.3,
        "sum_squared": 2631.6899999999996,
        "min": 51.3,
        "max": 51.3,
        "mean": 51.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=closedbook,model=qwen_qwen1.5-7b,additional_instructions=natural_qa_closedbook",
    "run_spec": {
      "name": "natural_qa:mode=closedbook,model=qwen_qwen1.5-7b,additional_instructions=natural_qa_closedbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "closedbook"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "Question: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_closedbook"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 141.06,
        "sum_squared": 19897.923600000002,
        "min": 141.06,
        "max": 141.06,
        "mean": 141.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.36131480216979983,
        "sum_squared": 0.13054838626700158,
        "min": 0.36131480216979983,
        "max": 0.36131480216979983,
        "mean": 0.36131480216979983,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.15392349726775958,
        "sum_squared": 0.023692443011137994,
        "min": 0.15392349726775958,
        "max": 0.15392349726775958,
        "mean": 0.15392349726775958,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 22.4,
        "sum_squared": 501.75999999999993,
        "min": 22.4,
        "max": 22.4,
        "mean": 22.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 141.06,
        "sum_squared": 19897.923600000002,
        "min": 141.06,
        "max": 141.06,
        "mean": 141.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 141.06,
        "sum_squared": 19897.923600000002,
        "min": 141.06,
        "max": 141.06,
        "mean": 141.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36131480216979983,
        "sum_squared": 0.13054838626700158,
        "min": 0.36131480216979983,
        "max": 0.36131480216979983,
        "mean": 0.36131480216979983,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.36131480216979983,
        "sum_squared": 0.13054838626700158,
        "min": 0.36131480216979983,
        "max": 0.36131480216979983,
        "mean": 0.36131480216979983,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.15392349726775958,
        "sum_squared": 0.023692443011137994,
        "min": 0.15392349726775958,
        "max": 0.15392349726775958,
        "mean": 0.15392349726775958,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.15392349726775958,
        "sum_squared": 0.023692443011137994,
        "min": 0.15392349726775958,
        "max": 0.15392349726775958,
        "mean": 0.15392349726775958,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 22.4,
        "sum_squared": 501.75999999999993,
        "min": 22.4,
        "max": 22.4,
        "mean": 22.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 22.4,
        "sum_squared": 501.75999999999993,
        "min": 22.4,
        "max": 22.4,
        "mean": 22.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=01-ai_yi-6b,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=01-ai_yi-6b,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/yi-6b",
        "model": "01-ai/yi-6b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2303.98,
        "sum_squared": 5308323.8404,
        "min": 2303.98,
        "max": 2303.98,
        "mean": 2303.98,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.015028839111328,
        "sum_squared": 1.0302835442276903,
        "min": 1.015028839111328,
        "max": 1.015028839111328,
        "mean": 1.015028839111328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.74,
        "sum_squared": 22.4676,
        "min": 4.74,
        "max": 4.74,
        "mean": 4.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.58,
        "sum_squared": 0.3364,
        "min": 0.58,
        "max": 0.58,
        "mean": 0.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.68,
        "sum_squared": 0.4624000000000001,
        "min": 0.68,
        "max": 0.68,
        "mean": 0.68,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.825775788439203,
        "sum_squared": 0.6819056527723872,
        "min": 0.825775788439203,
        "max": 0.825775788439203,
        "mean": 0.825775788439203,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 66.96,
        "sum_squared": 4483.641599999999,
        "min": 66.96,
        "max": 66.96,
        "mean": 66.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2303.98,
        "sum_squared": 5308323.8404,
        "min": 2303.98,
        "max": 2303.98,
        "mean": 2303.98,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2303.98,
        "sum_squared": 5308323.8404,
        "min": 2303.98,
        "max": 2303.98,
        "mean": 2303.98,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.015028839111328,
        "sum_squared": 1.0302835442276903,
        "min": 1.015028839111328,
        "max": 1.015028839111328,
        "mean": 1.015028839111328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.015028839111328,
        "sum_squared": 1.0302835442276903,
        "min": 1.015028839111328,
        "max": 1.015028839111328,
        "mean": 1.015028839111328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.74,
        "sum_squared": 22.4676,
        "min": 4.74,
        "max": 4.74,
        "mean": 4.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.74,
        "sum_squared": 22.4676,
        "min": 4.74,
        "max": 4.74,
        "mean": 4.74,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.58,
        "sum_squared": 0.3364,
        "min": 0.58,
        "max": 0.58,
        "mean": 0.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.58,
        "sum_squared": 0.3364,
        "min": 0.58,
        "max": 0.58,
        "mean": 0.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.68,
        "sum_squared": 0.4624000000000001,
        "min": 0.68,
        "max": 0.68,
        "mean": 0.68,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.68,
        "sum_squared": 0.4624000000000001,
        "min": 0.68,
        "max": 0.68,
        "mean": 0.68,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.825775788439203,
        "sum_squared": 0.6819056527723872,
        "min": 0.825775788439203,
        "max": 0.825775788439203,
        "mean": 0.825775788439203,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.825775788439203,
        "sum_squared": 0.6819056527723872,
        "min": 0.825775788439203,
        "max": 0.825775788439203,
        "mean": 0.825775788439203,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 66.96,
        "sum_squared": 4483.641599999999,
        "min": 66.96,
        "max": 66.96,
        "mean": 66.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 66.96,
        "sum_squared": 4483.641599999999,
        "min": 66.96,
        "max": 66.96,
        "mean": 66.96,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=google_gemma-7b,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=google_gemma-7b,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b",
        "model": "google/gemma-7b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2082.08,
        "sum_squared": 4335057.1263999995,
        "min": 2082.08,
        "max": 2082.08,
        "mean": 2082.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.42537864685058596,
        "sum_squared": 0.18094699319643553,
        "min": 0.42537864685058596,
        "max": 0.42537864685058596,
        "mean": 0.42537864685058596,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.86,
        "sum_squared": 23.619600000000002,
        "min": 4.86,
        "max": 4.86,
        "mean": 4.86,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7265654135338346,
        "sum_squared": 0.5278973001435922,
        "min": 0.7265654135338346,
        "max": 0.7265654135338346,
        "mean": 0.7265654135338346,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 22.28,
        "sum_squared": 496.39840000000004,
        "min": 22.28,
        "max": 22.28,
        "mean": 22.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2082.08,
        "sum_squared": 4335057.1263999995,
        "min": 2082.08,
        "max": 2082.08,
        "mean": 2082.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2082.08,
        "sum_squared": 4335057.1263999995,
        "min": 2082.08,
        "max": 2082.08,
        "mean": 2082.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42537864685058596,
        "sum_squared": 0.18094699319643553,
        "min": 0.42537864685058596,
        "max": 0.42537864685058596,
        "mean": 0.42537864685058596,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.42537864685058596,
        "sum_squared": 0.18094699319643553,
        "min": 0.42537864685058596,
        "max": 0.42537864685058596,
        "mean": 0.42537864685058596,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.86,
        "sum_squared": 23.619600000000002,
        "min": 4.86,
        "max": 4.86,
        "mean": 4.86,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.86,
        "sum_squared": 23.619600000000002,
        "min": 4.86,
        "max": 4.86,
        "mean": 4.86,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46,
        "sum_squared": 0.2116,
        "min": 0.46,
        "max": 0.46,
        "mean": 0.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.56,
        "sum_squared": 0.31360000000000005,
        "min": 0.56,
        "max": 0.56,
        "mean": 0.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7265654135338346,
        "sum_squared": 0.5278973001435922,
        "min": 0.7265654135338346,
        "max": 0.7265654135338346,
        "mean": 0.7265654135338346,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7265654135338346,
        "sum_squared": 0.5278973001435922,
        "min": 0.7265654135338346,
        "max": 0.7265654135338346,
        "mean": 0.7265654135338346,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 22.28,
        "sum_squared": 496.39840000000004,
        "min": 22.28,
        "max": 22.28,
        "mean": 22.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 22.28,
        "sum_squared": 496.39840000000004,
        "min": 22.28,
        "max": 22.28,
        "mean": 22.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=google_gemma-7b-it,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=google_gemma-7b-it,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/gemma-7b-it",
        "model": "google/gemma-7b-it",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2082.08,
        "sum_squared": 4335057.1263999995,
        "min": 2082.08,
        "max": 2082.08,
        "mean": 2082.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.4899437952041626,
        "sum_squared": 0.24004492245905842,
        "min": 0.4899437952041626,
        "max": 0.4899437952041626,
        "mean": 0.4899437952041626,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.86,
        "sum_squared": 23.619600000000002,
        "min": 4.86,
        "max": 4.86,
        "mean": 4.86,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.29723705845974746,
        "sum_squared": 0.08834986892180333,
        "min": 0.29723705845974746,
        "max": 0.29723705845974746,
        "mean": 0.29723705845974746,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 23.94,
        "sum_squared": 573.1236,
        "min": 23.94,
        "max": 23.94,
        "mean": 23.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2082.08,
        "sum_squared": 4335057.1263999995,
        "min": 2082.08,
        "max": 2082.08,
        "mean": 2082.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2082.08,
        "sum_squared": 4335057.1263999995,
        "min": 2082.08,
        "max": 2082.08,
        "mean": 2082.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4899437952041626,
        "sum_squared": 0.24004492245905842,
        "min": 0.4899437952041626,
        "max": 0.4899437952041626,
        "mean": 0.4899437952041626,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4899437952041626,
        "sum_squared": 0.24004492245905842,
        "min": 0.4899437952041626,
        "max": 0.4899437952041626,
        "mean": 0.4899437952041626,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.86,
        "sum_squared": 23.619600000000002,
        "min": 4.86,
        "max": 4.86,
        "mean": 4.86,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.86,
        "sum_squared": 23.619600000000002,
        "min": 4.86,
        "max": 4.86,
        "mean": 4.86,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.02,
        "sum_squared": 0.0004,
        "min": 0.02,
        "max": 0.02,
        "mean": 0.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.14,
        "sum_squared": 0.019600000000000003,
        "min": 0.14,
        "max": 0.14,
        "mean": 0.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29723705845974746,
        "sum_squared": 0.08834986892180333,
        "min": 0.29723705845974746,
        "max": 0.29723705845974746,
        "mean": 0.29723705845974746,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.29723705845974746,
        "sum_squared": 0.08834986892180333,
        "min": 0.29723705845974746,
        "max": 0.29723705845974746,
        "mean": 0.29723705845974746,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 23.94,
        "sum_squared": 573.1236,
        "min": 23.94,
        "max": 23.94,
        "mean": 23.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 23.94,
        "sum_squared": 573.1236,
        "min": 23.94,
        "max": 23.94,
        "mean": 23.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=google_text-bison@001,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=google_text-bison@001,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-bison@001",
        "model": "google/text-bison@001",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2146.7,
        "sum_squared": 4608320.89,
        "min": 2146.7,
        "max": 2146.7,
        "mean": 2146.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0056433248519898,
        "sum_squared": 1.0113184968193647,
        "min": 1.0056433248519898,
        "max": 1.0056433248519898,
        "mean": 1.0056433248519898,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7,
        "sum_squared": 0.48999999999999994,
        "min": 0.7,
        "max": 0.7,
        "mean": 0.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.72,
        "sum_squared": 0.5184,
        "min": 0.72,
        "max": 0.72,
        "mean": 0.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8163155929038283,
        "sum_squared": 0.6663711472179287,
        "min": 0.8163155929038283,
        "max": 0.8163155929038283,
        "mean": 0.8163155929038283,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2146.7,
        "sum_squared": 4608320.89,
        "min": 2146.7,
        "max": 2146.7,
        "mean": 2146.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2146.7,
        "sum_squared": 4608320.89,
        "min": 2146.7,
        "max": 2146.7,
        "mean": 2146.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0056433248519898,
        "sum_squared": 1.0113184968193647,
        "min": 1.0056433248519898,
        "max": 1.0056433248519898,
        "mean": 1.0056433248519898,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0056433248519898,
        "sum_squared": 1.0113184968193647,
        "min": 1.0056433248519898,
        "max": 1.0056433248519898,
        "mean": 1.0056433248519898,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7,
        "sum_squared": 0.48999999999999994,
        "min": 0.7,
        "max": 0.7,
        "mean": 0.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7,
        "sum_squared": 0.48999999999999994,
        "min": 0.7,
        "max": 0.7,
        "mean": 0.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.72,
        "sum_squared": 0.5184,
        "min": 0.72,
        "max": 0.72,
        "mean": 0.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.72,
        "sum_squared": 0.5184,
        "min": 0.72,
        "max": 0.72,
        "mean": 0.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8163155929038283,
        "sum_squared": 0.6663711472179287,
        "min": 0.8163155929038283,
        "max": 0.8163155929038283,
        "mean": 0.8163155929038283,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8163155929038283,
        "sum_squared": 0.6663711472179287,
        "min": 0.8163155929038283,
        "max": 0.8163155929038283,
        "mean": 0.8163155929038283,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=google_text-unicorn@001,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=google_text-unicorn@001,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/text-unicorn@001",
        "model": "google/text-unicorn@001",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2146.7,
        "sum_squared": 4608320.89,
        "min": 2146.7,
        "max": 2146.7,
        "mean": 2146.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.2453894138336183,
        "sum_squared": 10.532552447423317,
        "min": 3.2453894138336183,
        "max": 3.2453894138336183,
        "mean": 3.2453894138336183,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8127808145660421,
        "sum_squared": 0.6606126525266389,
        "min": 0.8127808145660421,
        "max": 0.8127808145660421,
        "mean": 0.8127808145660421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2146.7,
        "sum_squared": 4608320.89,
        "min": 2146.7,
        "max": 2146.7,
        "mean": 2146.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2146.7,
        "sum_squared": 4608320.89,
        "min": 2146.7,
        "max": 2146.7,
        "mean": 2146.7,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.2453894138336183,
        "sum_squared": 10.532552447423317,
        "min": 3.2453894138336183,
        "max": 3.2453894138336183,
        "mean": 3.2453894138336183,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.2453894138336183,
        "sum_squared": 10.532552447423317,
        "min": 3.2453894138336183,
        "max": 3.2453894138336183,
        "mean": 3.2453894138336183,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.64,
        "sum_squared": 0.4096,
        "min": 0.64,
        "max": 0.64,
        "mean": 0.64,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8127808145660421,
        "sum_squared": 0.6606126525266389,
        "min": 0.8127808145660421,
        "max": 0.8127808145660421,
        "mean": 0.8127808145660421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8127808145660421,
        "sum_squared": 0.6606126525266389,
        "min": 0.8127808145660421,
        "max": 0.8127808145660421,
        "mean": 0.8127808145660421,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=meta_llama-2-7b,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=meta_llama-2-7b,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-2-7b",
        "model": "meta/llama-2-7b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2406.28,
        "sum_squared": 5790183.438400001,
        "min": 2406.28,
        "max": 2406.28,
        "mean": 2406.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.9379725503921509,
        "sum_squared": 0.879792505289156,
        "min": 0.9379725503921509,
        "max": 0.9379725503921509,
        "mean": 0.9379725503921509,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.72,
        "sum_squared": 22.278399999999998,
        "min": 4.72,
        "max": 4.72,
        "mean": 4.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6402291275975487,
        "sum_squared": 0.40989333582431825,
        "min": 0.6402291275975487,
        "max": 0.6402291275975487,
        "mean": 0.6402291275975487,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 38.54,
        "sum_squared": 1485.3316,
        "min": 38.54,
        "max": 38.54,
        "mean": 38.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2406.28,
        "sum_squared": 5790183.438400001,
        "min": 2406.28,
        "max": 2406.28,
        "mean": 2406.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2406.28,
        "sum_squared": 5790183.438400001,
        "min": 2406.28,
        "max": 2406.28,
        "mean": 2406.28,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9379725503921509,
        "sum_squared": 0.879792505289156,
        "min": 0.9379725503921509,
        "max": 0.9379725503921509,
        "mean": 0.9379725503921509,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9379725503921509,
        "sum_squared": 0.879792505289156,
        "min": 0.9379725503921509,
        "max": 0.9379725503921509,
        "mean": 0.9379725503921509,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.72,
        "sum_squared": 22.278399999999998,
        "min": 4.72,
        "max": 4.72,
        "mean": 4.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.72,
        "sum_squared": 22.278399999999998,
        "min": 4.72,
        "max": 4.72,
        "mean": 4.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4,
        "sum_squared": 0.16000000000000003,
        "min": 0.4,
        "max": 0.4,
        "mean": 0.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6402291275975487,
        "sum_squared": 0.40989333582431825,
        "min": 0.6402291275975487,
        "max": 0.6402291275975487,
        "mean": 0.6402291275975487,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6402291275975487,
        "sum_squared": 0.40989333582431825,
        "min": 0.6402291275975487,
        "max": 0.6402291275975487,
        "mean": 0.6402291275975487,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 38.54,
        "sum_squared": 1485.3316,
        "min": 38.54,
        "max": 38.54,
        "mean": 38.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 38.54,
        "sum_squared": 1485.3316,
        "min": 38.54,
        "max": 38.54,
        "mean": 38.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=microsoft_phi-2,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=microsoft_phi-2,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/phi-2",
        "model": "microsoft/phi-2",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1523.78,
        "sum_squared": 2321905.4884,
        "min": 1523.78,
        "max": 1523.78,
        "mean": 1523.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.48971110343933105,
        "sum_squared": 0.2398169648317672,
        "min": 0.48971110343933105,
        "max": 0.48971110343933105,
        "mean": 0.48971110343933105,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.46,
        "sum_squared": 19.8916,
        "min": 4.46,
        "max": 4.46,
        "mean": 4.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7359798534798534,
        "sum_squared": 0.5416663447282265,
        "min": 0.7359798534798534,
        "max": 0.7359798534798534,
        "mean": 0.7359798534798534,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 29.08,
        "sum_squared": 845.6463999999999,
        "min": 29.08,
        "max": 29.08,
        "mean": 29.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1523.78,
        "sum_squared": 2321905.4884,
        "min": 1523.78,
        "max": 1523.78,
        "mean": 1523.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1523.78,
        "sum_squared": 2321905.4884,
        "min": 1523.78,
        "max": 1523.78,
        "mean": 1523.78,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48971110343933105,
        "sum_squared": 0.2398169648317672,
        "min": 0.48971110343933105,
        "max": 0.48971110343933105,
        "mean": 0.48971110343933105,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48971110343933105,
        "sum_squared": 0.2398169648317672,
        "min": 0.48971110343933105,
        "max": 0.48971110343933105,
        "mean": 0.48971110343933105,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.46,
        "sum_squared": 19.8916,
        "min": 4.46,
        "max": 4.46,
        "mean": 4.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.46,
        "sum_squared": 19.8916,
        "min": 4.46,
        "max": 4.46,
        "mean": 4.46,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.06,
        "sum_squared": 0.0036,
        "min": 0.06,
        "max": 0.06,
        "mean": 0.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7359798534798534,
        "sum_squared": 0.5416663447282265,
        "min": 0.7359798534798534,
        "max": 0.7359798534798534,
        "mean": 0.7359798534798534,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7359798534798534,
        "sum_squared": 0.5416663447282265,
        "min": 0.7359798534798534,
        "max": 0.7359798534798534,
        "mean": 0.7359798534798534,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 29.08,
        "sum_squared": 845.6463999999999,
        "min": 29.08,
        "max": 29.08,
        "mean": 29.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 29.08,
        "sum_squared": 845.6463999999999,
        "min": 29.08,
        "max": 29.08,
        "mean": 29.08,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=mistralai_mixtral-8x7b-32kseqlen,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-32kseqlen",
        "model": "mistralai/mixtral-8x7b-32kseqlen",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2385.14,
        "sum_squared": 5688892.819599999,
        "min": 2385.14,
        "max": 2385.14,
        "mean": 2385.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.6702319574356079,
        "sum_squared": 0.44921087676796656,
        "min": 0.6702319574356079,
        "max": 0.6702319574356079,
        "mean": 0.6702319574356079,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.72,
        "sum_squared": 22.278399999999998,
        "min": 4.72,
        "max": 4.72,
        "mean": 4.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7447166955851166,
        "sum_squared": 0.5546029566832151,
        "min": 0.7447166955851166,
        "max": 0.7447166955851166,
        "mean": 0.7447166955851166,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 19.52,
        "sum_squared": 381.0304,
        "min": 19.52,
        "max": 19.52,
        "mean": 19.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2385.14,
        "sum_squared": 5688892.819599999,
        "min": 2385.14,
        "max": 2385.14,
        "mean": 2385.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2385.14,
        "sum_squared": 5688892.819599999,
        "min": 2385.14,
        "max": 2385.14,
        "mean": 2385.14,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6702319574356079,
        "sum_squared": 0.44921087676796656,
        "min": 0.6702319574356079,
        "max": 0.6702319574356079,
        "mean": 0.6702319574356079,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6702319574356079,
        "sum_squared": 0.44921087676796656,
        "min": 0.6702319574356079,
        "max": 0.6702319574356079,
        "mean": 0.6702319574356079,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.72,
        "sum_squared": 22.278399999999998,
        "min": 4.72,
        "max": 4.72,
        "mean": 4.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.72,
        "sum_squared": 22.278399999999998,
        "min": 4.72,
        "max": 4.72,
        "mean": 4.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7447166955851166,
        "sum_squared": 0.5546029566832151,
        "min": 0.7447166955851166,
        "max": 0.7447166955851166,
        "mean": 0.7447166955851166,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7447166955851166,
        "sum_squared": 0.5546029566832151,
        "min": 0.7447166955851166,
        "max": 0.7447166955851166,
        "mean": 0.7447166955851166,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 19.52,
        "sum_squared": 381.0304,
        "min": 19.52,
        "max": 19.52,
        "mean": 19.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 19.52,
        "sum_squared": 381.0304,
        "min": 19.52,
        "max": 19.52,
        "mean": 19.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=openai_gpt-3.5-turbo-0613,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-3.5-turbo-0613",
        "model": "openai/gpt-3.5-turbo-0613",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1708.94,
        "sum_squared": 2920475.9236000003,
        "min": 1708.94,
        "max": 1708.94,
        "mean": 1708.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5898475360870361,
        "sum_squared": 0.34792011582794735,
        "min": 0.5898475360870361,
        "max": 0.5898475360870361,
        "mean": 0.5898475360870361,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7565861124427687,
        "sum_squared": 0.5724225455412619,
        "min": 0.7565861124427687,
        "max": 0.7565861124427687,
        "mean": 0.7565861124427687,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 38.56,
        "sum_squared": 1486.8736000000001,
        "min": 38.56,
        "max": 38.56,
        "mean": 38.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1708.94,
        "sum_squared": 2920475.9236000003,
        "min": 1708.94,
        "max": 1708.94,
        "mean": 1708.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1708.94,
        "sum_squared": 2920475.9236000003,
        "min": 1708.94,
        "max": 1708.94,
        "mean": 1708.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5898475360870361,
        "sum_squared": 0.34792011582794735,
        "min": 0.5898475360870361,
        "max": 0.5898475360870361,
        "mean": 0.5898475360870361,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5898475360870361,
        "sum_squared": 0.34792011582794735,
        "min": 0.5898475360870361,
        "max": 0.5898475360870361,
        "mean": 0.5898475360870361,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.8,
        "sum_squared": 23.04,
        "min": 4.8,
        "max": 4.8,
        "mean": 4.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.04,
        "sum_squared": 0.0016,
        "min": 0.04,
        "max": 0.04,
        "mean": 0.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.44,
        "sum_squared": 0.1936,
        "min": 0.44,
        "max": 0.44,
        "mean": 0.44,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7565861124427687,
        "sum_squared": 0.5724225455412619,
        "min": 0.7565861124427687,
        "max": 0.7565861124427687,
        "mean": 0.7565861124427687,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7565861124427687,
        "sum_squared": 0.5724225455412619,
        "min": 0.7565861124427687,
        "max": 0.7565861124427687,
        "mean": 0.7565861124427687,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 9.16,
        "sum_squared": 83.9056,
        "min": 9.16,
        "max": 9.16,
        "mean": 9.16,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 38.56,
        "sum_squared": 1486.8736000000001,
        "min": 38.56,
        "max": 38.56,
        "mean": 38.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 38.56,
        "sum_squared": 1486.8736000000001,
        "min": 38.56,
        "max": 38.56,
        "mean": 38.56,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=openai_gpt-4-1106-preview,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4-1106-preview",
        "model": "openai/gpt-4-1106-preview",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1902.54,
        "sum_squared": 3619658.4516,
        "min": 1902.54,
        "max": 1902.54,
        "mean": 1902.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.2700903177261353,
        "sum_squared": 1.6131294151816755,
        "min": 1.2700903177261353,
        "max": 1.2700903177261353,
        "mean": 1.2700903177261353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8720817112883674,
        "sum_squared": 0.7605265111636474,
        "min": 0.8720817112883674,
        "max": 0.8720817112883674,
        "mean": 0.8720817112883674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 32.4,
        "sum_squared": 1049.76,
        "min": 32.4,
        "max": 32.4,
        "mean": 32.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1902.54,
        "sum_squared": 3619658.4516,
        "min": 1902.54,
        "max": 1902.54,
        "mean": 1902.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1902.54,
        "sum_squared": 3619658.4516,
        "min": 1902.54,
        "max": 1902.54,
        "mean": 1902.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2700903177261353,
        "sum_squared": 1.6131294151816755,
        "min": 1.2700903177261353,
        "max": 1.2700903177261353,
        "mean": 1.2700903177261353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2700903177261353,
        "sum_squared": 1.6131294151816755,
        "min": 1.2700903177261353,
        "max": 1.2700903177261353,
        "mean": 1.2700903177261353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8720817112883674,
        "sum_squared": 0.7605265111636474,
        "min": 0.8720817112883674,
        "max": 0.8720817112883674,
        "mean": 0.8720817112883674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8720817112883674,
        "sum_squared": 0.7605265111636474,
        "min": 0.8720817112883674,
        "max": 0.8720817112883674,
        "mean": 0.8720817112883674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 8.18,
        "sum_squared": 66.91239999999999,
        "min": 8.18,
        "max": 8.18,
        "mean": 8.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 32.4,
        "sum_squared": 1049.76,
        "min": 32.4,
        "max": 32.4,
        "mean": 32.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 32.4,
        "sum_squared": 1049.76,
        "min": 32.4,
        "max": 32.4,
        "mean": 32.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/mmlu_yifan/natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b,additional_instructions=natural_qa_openbook",
    "run_spec": {
      "name": "natural_qa:mode=openbook_longans,model=qwen_qwen1.5-7b,additional_instructions=natural_qa_openbook",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.natural_qa_scenario.NaturalQAScenario",
        "args": {
          "mode": "openbook_longans"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Answer each of the following questions with a short answer that is a span within the passage or a boolean 'yes' or 'no' answer.\n",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "Answer: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 50,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen1.5-7b",
        "model": "qwen/qwen1.5-7b",
        "temperature": 0.0,
        "max_tokens": 300,
        "stop_sequences": [
          "\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "f1_score"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "natural_qa_openbook_longans"
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 2206.24,
        "sum_squared": 4867494.937599999,
        "min": 2206.24,
        "max": 2206.24,
        "mean": 2206.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5468538618087768,
        "sum_squared": 0.29904914617517275,
        "min": 0.5468538618087768,
        "max": 0.5468538618087768,
        "mean": 0.5468538618087768,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.7825500289184498,
        "sum_squared": 0.6123845477602666,
        "min": 0.7825500289184498,
        "max": 0.7825500289184498,
        "mean": 0.7825500289184498,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 33.82,
        "sum_squared": 1143.7924,
        "min": 33.82,
        "max": 33.82,
        "mean": 33.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.06,
        "sum_squared": 4.2436,
        "min": 2.06,
        "max": 2.06,
        "mean": 2.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2206.24,
        "sum_squared": 4867494.937599999,
        "min": 2206.24,
        "max": 2206.24,
        "mean": 2206.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2206.24,
        "sum_squared": 4867494.937599999,
        "min": 2206.24,
        "max": 2206.24,
        "mean": 2206.24,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5468538618087768,
        "sum_squared": 0.29904914617517275,
        "min": 0.5468538618087768,
        "max": 0.5468538618087768,
        "mean": 0.5468538618087768,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5468538618087768,
        "sum_squared": 0.29904914617517275,
        "min": 0.5468538618087768,
        "max": 0.5468538618087768,
        "mean": 0.5468538618087768,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.52,
        "sum_squared": 0.27040000000000003,
        "min": 0.52,
        "max": 0.52,
        "mean": 0.52,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7825500289184498,
        "sum_squared": 0.6123845477602666,
        "min": 0.7825500289184498,
        "max": 0.7825500289184498,
        "mean": 0.7825500289184498,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "f1_score",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7825500289184498,
        "sum_squared": 0.6123845477602666,
        "min": 0.7825500289184498,
        "max": 0.7825500289184498,
        "mean": 0.7825500289184498,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 33.82,
        "sum_squared": 1143.7924,
        "min": 33.82,
        "max": 33.82,
        "mean": 33.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 33.82,
        "sum_squared": 1143.7924,
        "min": 33.82,
        "max": 33.82,
        "mean": 33.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 50.0,
        "sum_squared": 2500.0,
        "min": 50.0,
        "max": 50.0,
        "mean": 50.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  }
]