[
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=anthropic_claude-3-5-haiku-20241022",
    "run_spec": {
      "name": "bird_sql:model=anthropic_claude-3-5-haiku-20241022",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-haiku-20241022",
        "model": "anthropic/claude-3-5-haiku-20241022",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1102.77,
        "sum_squared": 1216101.6729,
        "min": 1102.77,
        "max": 1102.77,
        "mean": 1102.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.15711697101593,
        "sum_squared": 17.28162151070866,
        "min": 4.15711697101593,
        "max": 4.15711697101593,
        "mean": 4.15711697101593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 692.61,
        "sum_squared": 479708.6121,
        "min": 692.61,
        "max": 692.61,
        "mean": 692.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1102.77,
        "sum_squared": 1216101.6729,
        "min": 1102.77,
        "max": 1102.77,
        "mean": 1102.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1102.77,
        "sum_squared": 1216101.6729,
        "min": 1102.77,
        "max": 1102.77,
        "mean": 1102.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.15711697101593,
        "sum_squared": 17.28162151070866,
        "min": 4.15711697101593,
        "max": 4.15711697101593,
        "mean": 4.15711697101593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.15711697101593,
        "sum_squared": 17.28162151070866,
        "min": 4.15711697101593,
        "max": 4.15711697101593,
        "mean": 4.15711697101593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 197.73,
        "sum_squared": 39097.152899999994,
        "min": 197.73,
        "max": 197.73,
        "mean": 197.73,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 692.61,
        "sum_squared": 479708.6121,
        "min": 692.61,
        "max": 692.61,
        "mean": 692.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 692.61,
        "sum_squared": 479708.6121,
        "min": 692.61,
        "max": 692.61,
        "mean": 692.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.48,
        "sum_squared": 0.2304,
        "min": 0.48,
        "max": 0.48,
        "mean": 0.48,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "bird_sql:model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1102.77,
        "sum_squared": 1216101.6729,
        "min": 1102.77,
        "max": 1102.77,
        "mean": 1102.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.278286309242248,
        "sum_squared": 18.30373374384966,
        "min": 4.278286309242248,
        "max": 4.278286309242248,
        "mean": 4.278286309242248,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 834.58,
        "sum_squared": 696523.7764000001,
        "min": 834.58,
        "max": 834.58,
        "mean": 834.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1102.77,
        "sum_squared": 1216101.6729,
        "min": 1102.77,
        "max": 1102.77,
        "mean": 1102.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1102.77,
        "sum_squared": 1216101.6729,
        "min": 1102.77,
        "max": 1102.77,
        "mean": 1102.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.278286309242248,
        "sum_squared": 18.30373374384966,
        "min": 4.278286309242248,
        "max": 4.278286309242248,
        "mean": 4.278286309242248,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.278286309242248,
        "sum_squared": 18.30373374384966,
        "min": 4.278286309242248,
        "max": 4.278286309242248,
        "mean": 4.278286309242248,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 232.02,
        "sum_squared": 53833.2804,
        "min": 232.02,
        "max": 232.02,
        "mean": 232.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 834.58,
        "sum_squared": 696523.7764000001,
        "min": 834.58,
        "max": 834.58,
        "mean": 834.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 834.58,
        "sum_squared": 696523.7764000001,
        "min": 834.58,
        "max": 834.58,
        "mean": 834.58,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47,
        "sum_squared": 0.22089999999999999,
        "min": 0.47,
        "max": 0.47,
        "mean": 0.47,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=google_gemini-1.5-flash-002",
    "run_spec": {
      "name": "bird_sql:model=google_gemini-1.5-flash-002",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-1.5-flash-002",
        "model": "google/gemini-1.5-flash-002",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1181.01,
        "sum_squared": 1394784.6201,
        "min": 1181.01,
        "max": 1181.01,
        "mean": 1181.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.5261328601837159,
        "sum_squared": 2.3290815069325292,
        "min": 1.5261328601837159,
        "max": 1.5261328601837159,
        "mean": 1.5261328601837159,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1181.01,
        "sum_squared": 1394784.6201,
        "min": 1181.01,
        "max": 1181.01,
        "mean": 1181.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1181.01,
        "sum_squared": 1394784.6201,
        "min": 1181.01,
        "max": 1181.01,
        "mean": 1181.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.5261328601837159,
        "sum_squared": 2.3290815069325292,
        "min": 1.5261328601837159,
        "max": 1.5261328601837159,
        "mean": 1.5261328601837159,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.5261328601837159,
        "sum_squared": 2.3290815069325292,
        "min": 1.5261328601837159,
        "max": 1.5261328601837159,
        "mean": 1.5261328601837159,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=google_gemini-1.5-pro-002",
    "run_spec": {
      "name": "bird_sql:model=google_gemini-1.5-pro-002",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-1.5-pro-002",
        "model": "google/gemini-1.5-pro-002",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1181.01,
        "sum_squared": 1394784.6201,
        "min": 1181.01,
        "max": 1181.01,
        "mean": 1181.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.0779216051101685,
        "sum_squared": 9.473601407203956,
        "min": 3.0779216051101685,
        "max": 3.0779216051101685,
        "mean": 3.0779216051101685,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1181.01,
        "sum_squared": 1394784.6201,
        "min": 1181.01,
        "max": 1181.01,
        "mean": 1181.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1181.01,
        "sum_squared": 1394784.6201,
        "min": 1181.01,
        "max": 1181.01,
        "mean": 1181.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.0779216051101685,
        "sum_squared": 9.473601407203956,
        "min": 3.0779216051101685,
        "max": 3.0779216051101685,
        "mean": 3.0779216051101685,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.0779216051101685,
        "sum_squared": 9.473601407203956,
        "min": 3.0779216051101685,
        "max": 3.0779216051101685,
        "mean": 3.0779216051101685,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=meta_llama-3.1-405b-instruct-turbo",
    "run_spec": {
      "name": "bird_sql:model=meta_llama-3.1-405b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-405b-instruct-turbo",
        "model": "meta/llama-3.1-405b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.2729275226593018,
        "sum_squared": 10.712054568580754,
        "min": 3.2729275226593018,
        "max": 3.2729275226593018,
        "mean": 3.2729275226593018,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": -24.506075831708618,
        "sum_squared": 600.5477526694532,
        "min": -24.506075831708618,
        "max": -24.506075831708618,
        "mean": -24.506075831708618,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 879.81,
        "sum_squared": 774065.6360999999,
        "min": 879.81,
        "max": 879.81,
        "mean": 879.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.121364770848275,
        "sum_squared": 1.2574589492996044,
        "min": 1.121364770848275,
        "max": 1.121364770848275,
        "mean": 1.121364770848275,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.04018457857270877,
        "sum_squared": 0.0016148003550662047,
        "min": 0.04018457857270877,
        "max": 0.04018457857270877,
        "mean": 0.04018457857270877,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": -0.027853827339662674,
        "sum_squared": 0.0007758356974677398,
        "min": -0.027853827339662674,
        "max": -0.027853827339662674,
        "mean": -0.027853827339662674,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_cov_acc_area",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_acc@10",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.2729275226593018,
        "sum_squared": 10.712054568580754,
        "min": 3.2729275226593018,
        "max": 3.2729275226593018,
        "mean": 3.2729275226593018,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.2729275226593018,
        "sum_squared": 10.712054568580754,
        "min": 3.2729275226593018,
        "max": 3.2729275226593018,
        "mean": 3.2729275226593018,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -24.506075831708618,
        "sum_squared": 600.5477526694532,
        "min": -24.506075831708618,
        "max": -24.506075831708618,
        "mean": -24.506075831708618,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -24.506075831708618,
        "sum_squared": 600.5477526694532,
        "min": -24.506075831708618,
        "max": -24.506075831708618,
        "mean": -24.506075831708618,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 213.94,
        "sum_squared": 45770.323599999996,
        "min": 213.94,
        "max": 213.94,
        "mean": 213.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 879.81,
        "sum_squared": 774065.6360999999,
        "min": 879.81,
        "max": 879.81,
        "mean": 879.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 879.81,
        "sum_squared": 774065.6360999999,
        "min": 879.81,
        "max": 879.81,
        "mean": 879.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=meta_llama-3.1-70b-instruct-turbo",
    "run_spec": {
      "name": "bird_sql:model=meta_llama-3.1-70b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-70b-instruct-turbo",
        "model": "meta/llama-3.1-70b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.6353313827514648,
        "sum_squared": 2.674308731411818,
        "min": 1.6353313827514648,
        "max": 1.6353313827514648,
        "mean": 1.6353313827514648,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": -32.465370822437855,
        "sum_squared": 1054.0003026383993,
        "min": -32.465370822437855,
        "max": -32.465370822437855,
        "mean": -32.465370822437855,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 880.0,
        "sum_squared": 774400.0,
        "min": 880.0,
        "max": 880.0,
        "mean": 880.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.1567911666772324,
        "sum_squared": 1.3381658033024724,
        "min": 1.1567911666772324,
        "max": 1.1567911666772324,
        "mean": 1.1567911666772324,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.053224578961536755,
        "sum_squared": 0.0028328558056328608,
        "min": 0.053224578961536755,
        "max": 0.053224578961536755,
        "mean": 0.053224578961536755,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": -0.03689246684367938,
        "sum_squared": 0.0013610541098119826,
        "min": -0.03689246684367938,
        "max": -0.03689246684367938,
        "mean": -0.03689246684367938,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_cov_acc_area",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_acc@10",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6353313827514648,
        "sum_squared": 2.674308731411818,
        "min": 1.6353313827514648,
        "max": 1.6353313827514648,
        "mean": 1.6353313827514648,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6353313827514648,
        "sum_squared": 2.674308731411818,
        "min": 1.6353313827514648,
        "max": 1.6353313827514648,
        "mean": 1.6353313827514648,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -32.465370822437855,
        "sum_squared": 1054.0003026383993,
        "min": -32.465370822437855,
        "max": -32.465370822437855,
        "mean": -32.465370822437855,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -32.465370822437855,
        "sum_squared": 1054.0003026383993,
        "min": -32.465370822437855,
        "max": -32.465370822437855,
        "mean": -32.465370822437855,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 222.9,
        "sum_squared": 49684.41,
        "min": 222.9,
        "max": 222.9,
        "mean": 222.9,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 880.0,
        "sum_squared": 774400.0,
        "min": 880.0,
        "max": 880.0,
        "mean": 880.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 880.0,
        "sum_squared": 774400.0,
        "min": 880.0,
        "max": 880.0,
        "mean": 880.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.55,
        "sum_squared": 0.30250000000000005,
        "min": 0.55,
        "max": 0.55,
        "mean": 0.55,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=meta_llama-3.1-8b-instruct-turbo",
    "run_spec": {
      "name": "bird_sql:model=meta_llama-3.1-8b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-8b-instruct-turbo",
        "model": "meta/llama-3.1-8b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.6573841714859008,
        "sum_squared": 2.7469222918920058,
        "min": 1.6573841714859008,
        "max": 1.6573841714859008,
        "mean": 1.6573841714859008,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": -35.03595062195679,
        "sum_squared": 1227.5178359841946,
        "min": -35.03595062195679,
        "max": -35.03595062195679,
        "mean": -35.03595062195679,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 889.72,
        "sum_squared": 791601.6784000001,
        "min": 889.72,
        "max": 889.72,
        "mean": 889.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.1685663401516966,
        "sum_squared": 1.3655472913355307,
        "min": 1.1685663401516966,
        "max": 1.1685663401516966,
        "mean": 1.1685663401516966,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.05681134763198271,
        "sum_squared": 0.003227529219761987,
        "min": 0.05681134763198271,
        "max": 0.05681134763198271,
        "mean": 0.05681134763198271,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": -0.03937862543491974,
        "sum_squared": 0.0015506761411437078,
        "min": -0.03937862543491974,
        "max": -0.03937862543491974,
        "mean": -0.03937862543491974,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_cov_acc_area",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_acc@10",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1024.99,
        "sum_squared": 1050604.5001,
        "min": 1024.99,
        "max": 1024.99,
        "mean": 1024.99,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6573841714859008,
        "sum_squared": 2.7469222918920058,
        "min": 1.6573841714859008,
        "max": 1.6573841714859008,
        "mean": 1.6573841714859008,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.6573841714859008,
        "sum_squared": 2.7469222918920058,
        "min": 1.6573841714859008,
        "max": 1.6573841714859008,
        "mean": 1.6573841714859008,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -35.03595062195679,
        "sum_squared": 1227.5178359841946,
        "min": -35.03595062195679,
        "max": -35.03595062195679,
        "mean": -35.03595062195679,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -35.03595062195679,
        "sum_squared": 1227.5178359841946,
        "min": -35.03595062195679,
        "max": -35.03595062195679,
        "mean": -35.03595062195679,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 224.91,
        "sum_squared": 50584.5081,
        "min": 224.91,
        "max": 224.91,
        "mean": 224.91,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 889.72,
        "sum_squared": 791601.6784000001,
        "min": 889.72,
        "max": 889.72,
        "mean": 889.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 889.72,
        "sum_squared": 791601.6784000001,
        "min": 889.72,
        "max": 889.72,
        "mean": 889.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.37,
        "sum_squared": 0.1369,
        "min": 0.37,
        "max": 0.37,
        "mean": 0.37,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37,
        "sum_squared": 0.1369,
        "min": 0.37,
        "max": 0.37,
        "mean": 0.37,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.37,
        "sum_squared": 0.1369,
        "min": 0.37,
        "max": 0.37,
        "mean": 0.37,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=openai_gpt-4o-2024-08-06",
    "run_spec": {
      "name": "bird_sql:model=openai_gpt-4o-2024-08-06",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-2024-08-06",
        "model": "openai/gpt-4o-2024-08-06",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1030.82,
        "sum_squared": 1062589.8723999998,
        "min": 1030.82,
        "max": 1030.82,
        "mean": 1030.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 6.375856232643128,
        "sum_squared": 40.651542699334215,
        "min": 6.375856232643128,
        "max": 6.375856232643128,
        "mean": 6.375856232643128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 964.94,
        "sum_squared": 931109.2036000001,
        "min": 964.94,
        "max": 964.94,
        "mean": 964.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1030.82,
        "sum_squared": 1062589.8723999998,
        "min": 1030.82,
        "max": 1030.82,
        "mean": 1030.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1030.82,
        "sum_squared": 1062589.8723999998,
        "min": 1030.82,
        "max": 1030.82,
        "mean": 1030.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.375856232643128,
        "sum_squared": 40.651542699334215,
        "min": 6.375856232643128,
        "max": 6.375856232643128,
        "mean": 6.375856232643128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 6.375856232643128,
        "sum_squared": 40.651542699334215,
        "min": 6.375856232643128,
        "max": 6.375856232643128,
        "mean": 6.375856232643128,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 240.35,
        "sum_squared": 57768.1225,
        "min": 240.35,
        "max": 240.35,
        "mean": 240.35,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 964.94,
        "sum_squared": 931109.2036000001,
        "min": 964.94,
        "max": 964.94,
        "mean": 964.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 964.94,
        "sum_squared": 931109.2036000001,
        "min": 964.94,
        "max": 964.94,
        "mean": 964.94,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.62,
        "sum_squared": 0.3844,
        "min": 0.62,
        "max": 0.62,
        "mean": 0.62,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/bird_sql:model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "bird_sql:model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.bird_sql_scenario.BIRDSQLScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.bird_sql_metrics.BirdSQLMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "bird_sql"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.bird_sql_annotator.BirdSQLAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 1030.82,
        "sum_squared": 1062589.8723999998,
        "min": 1030.82,
        "max": 1030.82,
        "mean": 1030.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.5940049839019776,
        "sum_squared": 12.916871824312254,
        "min": 3.5940049839019776,
        "max": 3.5940049839019776,
        "mean": 3.5940049839019776,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 901.29,
        "sum_squared": 812323.6640999999,
        "min": 901.29,
        "max": 901.29,
        "mean": 901.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1030.82,
        "sum_squared": 1062589.8723999998,
        "min": 1030.82,
        "max": 1030.82,
        "mean": 1030.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1030.82,
        "sum_squared": 1062589.8723999998,
        "min": 1030.82,
        "max": 1030.82,
        "mean": 1030.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.5940049839019776,
        "sum_squared": 12.916871824312254,
        "min": 3.5940049839019776,
        "max": 3.5940049839019776,
        "mean": 3.5940049839019776,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.5940049839019776,
        "sum_squared": 12.916871824312254,
        "min": 3.5940049839019776,
        "max": 3.5940049839019776,
        "mean": 3.5940049839019776,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 227.43,
        "sum_squared": 51724.4049,
        "min": 227.43,
        "max": 227.43,
        "mean": 227.43,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 901.29,
        "sum_squared": 812323.6640999999,
        "min": 901.29,
        "max": 901.29,
        "mean": 901.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 901.29,
        "sum_squared": 812323.6640999999,
        "min": 901.29,
        "max": 901.29,
        "mean": 901.29,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.54,
        "sum_squared": 0.2916,
        "min": 0.54,
        "max": 0.54,
        "mean": 0.54,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=anthropic_claude-3-5-haiku-20241022",
    "run_spec": {
      "name": "spider:model=anthropic_claude-3-5-haiku-20241022",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-haiku-20241022",
        "model": "anthropic/claude-3-5-haiku-20241022",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 454.04,
        "sum_squared": 206152.32160000002,
        "min": 454.04,
        "max": 454.04,
        "mean": 454.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.0672072863578794,
        "sum_squared": 9.407760537486867,
        "min": 3.0672072863578794,
        "max": 3.0672072863578794,
        "mean": 3.0672072863578794,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 523.79,
        "sum_squared": 274355.9641,
        "min": 523.79,
        "max": 523.79,
        "mean": 523.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 454.04,
        "sum_squared": 206152.32160000002,
        "min": 454.04,
        "max": 454.04,
        "mean": 454.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 454.04,
        "sum_squared": 206152.32160000002,
        "min": 454.04,
        "max": 454.04,
        "mean": 454.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.0672072863578794,
        "sum_squared": 9.407760537486867,
        "min": 3.0672072863578794,
        "max": 3.0672072863578794,
        "mean": 3.0672072863578794,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.0672072863578794,
        "sum_squared": 9.407760537486867,
        "min": 3.0672072863578794,
        "max": 3.0672072863578794,
        "mean": 3.0672072863578794,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 143.6,
        "sum_squared": 20620.96,
        "min": 143.6,
        "max": 143.6,
        "mean": 143.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 523.79,
        "sum_squared": 274355.9641,
        "min": 523.79,
        "max": 523.79,
        "mean": 523.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 523.79,
        "sum_squared": 274355.9641,
        "min": 523.79,
        "max": 523.79,
        "mean": 523.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.63,
        "sum_squared": 0.39690000000000003,
        "min": 0.63,
        "max": 0.63,
        "mean": 0.63,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "spider:model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 454.04,
        "sum_squared": 206152.32160000002,
        "min": 454.04,
        "max": 454.04,
        "mean": 454.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.4905085706710817,
        "sum_squared": 12.183650081928278,
        "min": 3.4905085706710817,
        "max": 3.4905085706710817,
        "mean": 3.4905085706710817,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 695.53,
        "sum_squared": 483761.98089999997,
        "min": 695.53,
        "max": 695.53,
        "mean": 695.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 454.04,
        "sum_squared": 206152.32160000002,
        "min": 454.04,
        "max": 454.04,
        "mean": 454.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 454.04,
        "sum_squared": 206152.32160000002,
        "min": 454.04,
        "max": 454.04,
        "mean": 454.04,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.4905085706710817,
        "sum_squared": 12.183650081928278,
        "min": 3.4905085706710817,
        "max": 3.4905085706710817,
        "mean": 3.4905085706710817,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.4905085706710817,
        "sum_squared": 12.183650081928278,
        "min": 3.4905085706710817,
        "max": 3.4905085706710817,
        "mean": 3.4905085706710817,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 184.57,
        "sum_squared": 34066.084899999994,
        "min": 184.57,
        "max": 184.57,
        "mean": 184.57,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 695.53,
        "sum_squared": 483761.98089999997,
        "min": 695.53,
        "max": 695.53,
        "mean": 695.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 695.53,
        "sum_squared": 483761.98089999997,
        "min": 695.53,
        "max": 695.53,
        "mean": 695.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.66,
        "sum_squared": 0.43560000000000004,
        "min": 0.66,
        "max": 0.66,
        "mean": 0.66,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=google_gemini-1.5-flash-002",
    "run_spec": {
      "name": "spider:model=google_gemini-1.5-flash-002",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-1.5-flash-002",
        "model": "google/gemini-1.5-flash-002",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 443.55,
        "sum_squared": 196736.6025,
        "min": 443.55,
        "max": 443.55,
        "mean": 443.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.2284269905090333,
        "sum_squared": 1.5090328710110805,
        "min": 1.2284269905090333,
        "max": 1.2284269905090333,
        "mean": 1.2284269905090333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 443.55,
        "sum_squared": 196736.6025,
        "min": 443.55,
        "max": 443.55,
        "mean": 443.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 443.55,
        "sum_squared": 196736.6025,
        "min": 443.55,
        "max": 443.55,
        "mean": 443.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2284269905090333,
        "sum_squared": 1.5090328710110805,
        "min": 1.2284269905090333,
        "max": 1.2284269905090333,
        "mean": 1.2284269905090333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.2284269905090333,
        "sum_squared": 1.5090328710110805,
        "min": 1.2284269905090333,
        "max": 1.2284269905090333,
        "mean": 1.2284269905090333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.79,
        "sum_squared": 0.6241000000000001,
        "min": 0.79,
        "max": 0.79,
        "mean": 0.79,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=google_gemini-1.5-pro-002",
    "run_spec": {
      "name": "spider:model=google_gemini-1.5-pro-002",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-1.5-pro-002",
        "model": "google/gemini-1.5-pro-002",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 443.55,
        "sum_squared": 196736.6025,
        "min": 443.55,
        "max": 443.55,
        "mean": 443.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.9179282283782957,
        "sum_squared": 8.5143051459669,
        "min": 2.9179282283782957,
        "max": 2.9179282283782957,
        "mean": 2.9179282283782957,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 443.55,
        "sum_squared": 196736.6025,
        "min": 443.55,
        "max": 443.55,
        "mean": 443.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 443.55,
        "sum_squared": 196736.6025,
        "min": 443.55,
        "max": 443.55,
        "mean": 443.55,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.9179282283782957,
        "sum_squared": 8.5143051459669,
        "min": 2.9179282283782957,
        "max": 2.9179282283782957,
        "mean": 2.9179282283782957,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.9179282283782957,
        "sum_squared": 8.5143051459669,
        "min": 2.9179282283782957,
        "max": 2.9179282283782957,
        "mean": 2.9179282283782957,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8,
        "sum_squared": 0.6400000000000001,
        "min": 0.8,
        "max": 0.8,
        "mean": 0.8,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=meta_llama-3.1-405b-instruct-turbo",
    "run_spec": {
      "name": "spider:model=meta_llama-3.1-405b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-405b-instruct-turbo",
        "model": "meta/llama-3.1-405b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 2.348685576915741,
        "sum_squared": 5.516323939212027,
        "min": 2.348685576915741,
        "max": 2.348685576915741,
        "mean": 2.348685576915741,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": -19.5075708039309,
        "sum_squared": 380.54531867037724,
        "min": -19.5075708039309,
        "max": -19.5075708039309,
        "mean": -19.5075708039309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 739.75,
        "sum_squared": 547230.0625,
        "min": 739.75,
        "max": 739.75,
        "mean": 739.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.123080142108907,
        "sum_squared": 1.2613090055993625,
        "min": 1.123080142108907,
        "max": 1.123080142108907,
        "mean": 1.123080142108907,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.03804457676055619,
        "sum_squared": 0.0014473898208898523,
        "min": 0.03804457676055619,
        "max": 0.03804457676055619,
        "mean": 0.03804457676055619,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": -0.02637049111717594,
        "sum_squared": 0.000695402801761055,
        "min": -0.02637049111717594,
        "max": -0.02637049111717594,
        "mean": -0.02637049111717594,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_cov_acc_area",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_acc@10",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.348685576915741,
        "sum_squared": 5.516323939212027,
        "min": 2.348685576915741,
        "max": 2.348685576915741,
        "mean": 2.348685576915741,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.348685576915741,
        "sum_squared": 5.516323939212027,
        "min": 2.348685576915741,
        "max": 2.348685576915741,
        "mean": 2.348685576915741,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -19.5075708039309,
        "sum_squared": 380.54531867037724,
        "min": -19.5075708039309,
        "max": -19.5075708039309,
        "mean": -19.5075708039309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -19.5075708039309,
        "sum_squared": 380.54531867037724,
        "min": -19.5075708039309,
        "max": -19.5075708039309,
        "mean": -19.5075708039309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 168.06,
        "sum_squared": 28244.1636,
        "min": 168.06,
        "max": 168.06,
        "mean": 168.06,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 739.75,
        "sum_squared": 547230.0625,
        "min": 739.75,
        "max": 739.75,
        "mean": 739.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 739.75,
        "sum_squared": 547230.0625,
        "min": 739.75,
        "max": 739.75,
        "mean": 739.75,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.77,
        "sum_squared": 0.5929,
        "min": 0.77,
        "max": 0.77,
        "mean": 0.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.77,
        "sum_squared": 0.5929,
        "min": 0.77,
        "max": 0.77,
        "mean": 0.77,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.77,
        "sum_squared": 0.5929,
        "min": 0.77,
        "max": 0.77,
        "mean": 0.77,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=meta_llama-3.1-70b-instruct-turbo",
    "run_spec": {
      "name": "spider:model=meta_llama-3.1-70b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-70b-instruct-turbo",
        "model": "meta/llama-3.1-70b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.1871030712127686,
        "sum_squared": 1.4092137016827875,
        "min": 1.1871030712127686,
        "max": 1.1871030712127686,
        "mean": 1.1871030712127686,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": -24.784525527489464,
        "sum_squared": 614.2727056227769,
        "min": -24.784525527489464,
        "max": -24.784525527489464,
        "mean": -24.784525527489464,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 708.0,
        "sum_squared": 501264.0,
        "min": 708.0,
        "max": 708.0,
        "mean": 708.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.1612145029275587,
        "sum_squared": 1.3484191218092971,
        "min": 1.1612145029275587,
        "max": 1.1612145029275587,
        "mean": 1.1612145029275587,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.05050354812047312,
        "sum_squared": 0.002550608372756944,
        "min": 0.05050354812047312,
        "max": 0.05050354812047312,
        "mean": 0.05050354812047312,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": -0.03500639198797947,
        "sum_squared": 0.001225447480016073,
        "min": -0.03500639198797947,
        "max": -0.03500639198797947,
        "mean": -0.03500639198797947,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_cov_acc_area",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_acc@10",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1871030712127686,
        "sum_squared": 1.4092137016827875,
        "min": 1.1871030712127686,
        "max": 1.1871030712127686,
        "mean": 1.1871030712127686,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1871030712127686,
        "sum_squared": 1.4092137016827875,
        "min": 1.1871030712127686,
        "max": 1.1871030712127686,
        "mean": 1.1871030712127686,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -24.784525527489464,
        "sum_squared": 614.2727056227769,
        "min": -24.784525527489464,
        "max": -24.784525527489464,
        "mean": -24.784525527489464,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -24.784525527489464,
        "sum_squared": 614.2727056227769,
        "min": -24.784525527489464,
        "max": -24.784525527489464,
        "mean": -24.784525527489464,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 165.82,
        "sum_squared": 27496.272399999998,
        "min": 165.82,
        "max": 165.82,
        "mean": 165.82,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 708.0,
        "sum_squared": 501264.0,
        "min": 708.0,
        "max": 708.0,
        "mean": 708.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 708.0,
        "sum_squared": 501264.0,
        "min": 708.0,
        "max": 708.0,
        "mean": 708.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.71,
        "sum_squared": 0.5041,
        "min": 0.71,
        "max": 0.71,
        "mean": 0.71,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.71,
        "sum_squared": 0.5041,
        "min": 0.71,
        "max": 0.71,
        "mean": 0.71,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.71,
        "sum_squared": 0.5041,
        "min": 0.71,
        "max": 0.71,
        "mean": 0.71,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=meta_llama-3.1-8b-instruct-turbo",
    "run_spec": {
      "name": "spider:model=meta_llama-3.1-8b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-8b-instruct-turbo",
        "model": "meta/llama-3.1-8b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.5893545317649841,
        "sum_squared": 0.34733876411192366,
        "min": 0.5893545317649841,
        "max": 0.5893545317649841,
        "mean": 0.5893545317649841,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": -29.04223062674154,
        "sum_squared": 843.4511597768442,
        "min": -29.04223062674154,
        "max": -29.04223062674154,
        "mean": -29.04223062674154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 710.21,
        "sum_squared": 504398.24410000007,
        "min": 710.21,
        "max": 710.21,
        "mean": 710.21,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.187255557055167,
        "sum_squared": 1.409575757758375,
        "min": 1.187255557055167,
        "max": 1.187255557055167,
        "mean": 1.187255557055167,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.05899534236571379,
        "sum_squared": 0.0034804504208477845,
        "min": 0.05899534236571379,
        "max": 0.05899534236571379,
        "mean": 0.05899534236571379,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": -0.040892455226963206,
        "sum_squared": 0.0016721928944891905,
        "min": -0.040892455226963206,
        "max": -0.040892455226963206,
        "mean": -0.040892455226963206,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_cov_acc_area",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "selective_acc@10",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_10_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "platt_ece_1_bin",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 365.01,
        "sum_squared": 133232.3001,
        "min": 365.01,
        "max": 365.01,
        "mean": 365.01,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5893545317649841,
        "sum_squared": 0.34733876411192366,
        "min": 0.5893545317649841,
        "max": 0.5893545317649841,
        "mean": 0.5893545317649841,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5893545317649841,
        "sum_squared": 0.34733876411192366,
        "min": 0.5893545317649841,
        "max": 0.5893545317649841,
        "mean": 0.5893545317649841,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.04223062674154,
        "sum_squared": 843.4511597768442,
        "min": -29.04223062674154,
        "max": -29.04223062674154,
        "mean": -29.04223062674154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.04223062674154,
        "sum_squared": 843.4511597768442,
        "min": -29.04223062674154,
        "max": -29.04223062674154,
        "mean": -29.04223062674154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.2,
        "sum_squared": 28628.639999999996,
        "min": 169.2,
        "max": 169.2,
        "mean": 169.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 710.21,
        "sum_squared": 504398.24410000007,
        "min": 710.21,
        "max": 710.21,
        "mean": 710.21,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 710.21,
        "sum_squared": 504398.24410000007,
        "min": 710.21,
        "max": 710.21,
        "mean": 710.21,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.61,
        "sum_squared": 0.3721,
        "min": 0.61,
        "max": 0.61,
        "mean": 0.61,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=openai_gpt-4o-2024-08-06",
    "run_spec": {
      "name": "spider:model=openai_gpt-4o-2024-08-06",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-2024-08-06",
        "model": "openai/gpt-4o-2024-08-06",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 367.53,
        "sum_squared": 135078.30089999997,
        "min": 367.53,
        "max": 367.53,
        "mean": 367.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 4.70607540845871,
        "sum_squared": 22.147145750099813,
        "min": 4.70607540845871,
        "max": 4.70607540845871,
        "mean": 4.70607540845871,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 794.18,
        "sum_squared": 630721.8723999999,
        "min": 794.18,
        "max": 794.18,
        "mean": 794.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 367.53,
        "sum_squared": 135078.30089999997,
        "min": 367.53,
        "max": 367.53,
        "mean": 367.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 367.53,
        "sum_squared": 135078.30089999997,
        "min": 367.53,
        "max": 367.53,
        "mean": 367.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.70607540845871,
        "sum_squared": 22.147145750099813,
        "min": 4.70607540845871,
        "max": 4.70607540845871,
        "mean": 4.70607540845871,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.70607540845871,
        "sum_squared": 22.147145750099813,
        "min": 4.70607540845871,
        "max": 4.70607540845871,
        "mean": 4.70607540845871,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 183.2,
        "sum_squared": 33562.24,
        "min": 183.2,
        "max": 183.2,
        "mean": 183.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 794.18,
        "sum_squared": 630721.8723999999,
        "min": 794.18,
        "max": 794.18,
        "mean": 794.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 794.18,
        "sum_squared": 630721.8723999999,
        "min": 794.18,
        "max": 794.18,
        "mean": 794.18,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.81,
        "sum_squared": 0.6561000000000001,
        "min": 0.81,
        "max": 0.81,
        "mean": 0.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.81,
        "sum_squared": 0.6561000000000001,
        "min": 0.81,
        "max": 0.81,
        "mean": 0.81,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.81,
        "sum_squared": 0.6561000000000001,
        "min": 0.81,
        "max": 0.81,
        "mean": 0.81,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.5.0/spider:model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "spider:model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.spider_scenario.SpiderScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "",
        "input_prefix": "",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 100,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 1024,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": [
              "exact_match",
              "quasi_exact_match",
              "prefix_exact_match",
              "quasi_prefix_exact_match"
            ]
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.spider_metrics.SpiderMetric",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "spider"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.spider_annotator.SpiderAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 367.53,
        "sum_squared": 135078.30089999997,
        "min": 367.53,
        "max": 367.53,
        "mean": 367.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "valid"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid"
        },
        "count": 1,
        "sum": 3.26744900226593,
        "sum_squared": 10.676222982408623,
        "min": 3.26744900226593,
        "max": 3.26744900226593,
        "mean": 3.26744900226593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid"
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid"
        },
        "count": 1,
        "sum": 754.37,
        "sum_squared": 569074.0969,
        "min": 754.37,
        "max": 754.37,
        "mean": 754.37,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "valid"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 367.53,
        "sum_squared": 135078.30089999997,
        "min": 367.53,
        "max": 367.53,
        "mean": 367.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 367.53,
        "sum_squared": 135078.30089999997,
        "min": 367.53,
        "max": 367.53,
        "mean": 367.53,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.26744900226593,
        "sum_squared": 10.676222982408623,
        "min": 3.26744900226593,
        "max": 3.26744900226593,
        "mean": 3.26744900226593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.26744900226593,
        "sum_squared": 10.676222982408623,
        "min": 3.26744900226593,
        "max": 3.26744900226593,
        "mean": 3.26744900226593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "quasi_prefix_exact_match",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 176.02,
        "sum_squared": 30983.040400000005,
        "min": 176.02,
        "max": 176.02,
        "mean": 176.02,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 754.37,
        "sum_squared": 569074.0969,
        "min": 754.37,
        "max": 754.37,
        "mean": 754.37,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 754.37,
        "sum_squared": 569074.0969,
        "min": 754.37,
        "max": 754.37,
        "mean": 754.37,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "valid"
        },
        "count": 1,
        "sum": 100.0,
        "sum_squared": 10000.0,
        "min": 100.0,
        "max": 100.0,
        "mean": 100.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid"
        },
        "count": 1,
        "sum": 0.72,
        "sum_squared": 0.5184,
        "min": 0.72,
        "max": 0.72,
        "mean": 0.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.72,
        "sum_squared": 0.5184,
        "min": 0.72,
        "max": 0.72,
        "mean": 0.72,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "execution_accuracy",
          "split": "valid",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.72,
        "sum_squared": 0.5184,
        "min": 0.72,
        "max": 0.72,
        "mean": 0.72,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  }
]