[
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "call_center_summarization:model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "revision": "main"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 5.293568943937619,
        "sum_squared": 28.02187216422084,
        "min": 5.293568943937619,
        "max": 5.293568943937619,
        "mean": 5.293568943937619,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 790.7083333333334,
        "sum_squared": 625219.6684027779,
        "min": 790.7083333333334,
        "max": 790.7083333333334,
        "mean": 790.7083333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.293568943937619,
        "sum_squared": 28.02187216422084,
        "min": 5.293568943937619,
        "max": 5.293568943937619,
        "mean": 5.293568943937619,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 5.293568943937619,
        "sum_squared": 28.02187216422084,
        "min": 5.293568943937619,
        "max": 5.293568943937619,
        "mean": 5.293568943937619,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.3,
        "sum_squared": 22891.690000000002,
        "min": 151.3,
        "max": 151.3,
        "mean": 151.3,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 790.7083333333334,
        "sum_squared": 625219.6684027779,
        "min": 790.7083333333334,
        "max": 790.7083333333334,
        "mean": 790.7083333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 790.7083333333334,
        "sum_squared": 625219.6684027779,
        "min": 790.7083333333334,
        "max": 790.7083333333334,
        "mean": 790.7083333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:model=meta_llama-3-70b-chat",
    "run_spec": {
      "name": "call_center_summarization:model=meta_llama-3-70b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "revision": "main"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-70b-chat",
        "model": "meta/llama-3-70b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.9473892460266748,
        "sum_squared": 3.792324875540341,
        "min": 1.9473892460266748,
        "max": 1.9473892460266748,
        "mean": 1.9473892460266748,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -15.829865674891238,
        "sum_squared": 250.58464728509983,
        "min": -15.829865674891238,
        "max": -15.829865674891238,
        "mean": -15.829865674891238,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 877.8458333333333,
        "sum_squared": 770613.3071006944,
        "min": 877.8458333333333,
        "max": 877.8458333333333,
        "mean": 877.8458333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.098169558767986,
        "sum_squared": 1.2059763798046732,
        "min": 1.098169558767986,
        "max": 1.098169558767986,
        "mean": 1.098169558767986,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.02601558023051202,
        "sum_squared": 0.0006768104147302078,
        "min": 0.02601558023051202,
        "max": 0.02601558023051202,
        "mean": 0.02601558023051202,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.018032626087410457,
        "sum_squared": 0.00032517560360835614,
        "min": -0.018032626087410457,
        "max": -0.018032626087410457,
        "mean": -0.018032626087410457,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9473892460266748,
        "sum_squared": 3.792324875540341,
        "min": 1.9473892460266748,
        "max": 1.9473892460266748,
        "mean": 1.9473892460266748,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9473892460266748,
        "sum_squared": 3.792324875540341,
        "min": 1.9473892460266748,
        "max": 1.9473892460266748,
        "mean": 1.9473892460266748,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -15.829865674891238,
        "sum_squared": 250.58464728509983,
        "min": -15.829865674891238,
        "max": -15.829865674891238,
        "mean": -15.829865674891238,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -15.829865674891238,
        "sum_squared": 250.58464728509983,
        "min": -15.829865674891238,
        "max": -15.829865674891238,
        "mean": -15.829865674891238,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.04166666666666,
        "sum_squared": 28575.08506944444,
        "min": 169.04166666666666,
        "max": 169.04166666666666,
        "mean": 169.04166666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 877.8458333333333,
        "sum_squared": 770613.3071006944,
        "min": 877.8458333333333,
        "max": 877.8458333333333,
        "mean": 877.8458333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 877.8458333333333,
        "sum_squared": 770613.3071006944,
        "min": 877.8458333333333,
        "max": 877.8458333333333,
        "mean": 877.8458333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:model=meta_llama-3-8b-chat",
    "run_spec": {
      "name": "call_center_summarization:model=meta_llama-3-8b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "revision": "main"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-8b-chat",
        "model": "meta/llama-3-8b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.6472620526949564,
        "sum_squared": 7.007996375638714,
        "min": 2.6472620526949564,
        "max": 2.6472620526949564,
        "mean": 2.6472620526949564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -29.036855126500328,
        "sum_squared": 843.1389556373683,
        "min": -29.036855126500328,
        "max": -29.036855126500328,
        "mean": -29.036855126500328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 897.2208333333333,
        "sum_squared": 805005.223767361,
        "min": 897.2208333333333,
        "max": 897.2208333333333,
        "mean": 897.2208333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1822788930548627,
        "sum_squared": 1.3977833809630313,
        "min": 1.1822788930548627,
        "max": 1.1822788930548627,
        "mean": 1.1822788930548627,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0466900960584917,
        "sum_squared": 0.002179965069951182,
        "min": 0.0466900960584917,
        "max": 0.0466900960584917,
        "mean": 0.0466900960584917,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.032363108443016535,
        "sum_squared": 0.0010473707880944481,
        "min": -0.032363108443016535,
        "max": -0.032363108443016535,
        "mean": -0.032363108443016535,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.6472620526949564,
        "sum_squared": 7.007996375638714,
        "min": 2.6472620526949564,
        "max": 2.6472620526949564,
        "mean": 2.6472620526949564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.6472620526949564,
        "sum_squared": 7.007996375638714,
        "min": 2.6472620526949564,
        "max": 2.6472620526949564,
        "mean": 2.6472620526949564,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.036855126500328,
        "sum_squared": 843.1389556373683,
        "min": -29.036855126500328,
        "max": -29.036855126500328,
        "mean": -29.036855126500328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.036855126500328,
        "sum_squared": 843.1389556373683,
        "min": -29.036855126500328,
        "max": -29.036855126500328,
        "mean": -29.036855126500328,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.4125,
        "sum_squared": 30071.895156249997,
        "min": 173.4125,
        "max": 173.4125,
        "mean": 173.4125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 897.2208333333333,
        "sum_squared": 805005.223767361,
        "min": 897.2208333333333,
        "max": 897.2208333333333,
        "mean": 897.2208333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 897.2208333333333,
        "sum_squared": 805005.223767361,
        "min": 897.2208333333333,
        "max": 897.2208333333333,
        "mean": 897.2208333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:model=openai_gpt-4o-2024-05-13",
    "run_spec": {
      "name": "call_center_summarization:model=openai_gpt-4o-2024-05-13",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "revision": "main"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-2024-05-13",
        "model": "openai/gpt-4o-2024-05-13",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.3968505134185154,
        "sum_squared": 5.744892383674601,
        "min": 2.3968505134185154,
        "max": 2.3968505134185154,
        "mean": 2.3968505134185154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 841.4791666666666,
        "sum_squared": 708087.1879340278,
        "min": 841.4791666666666,
        "max": 841.4791666666666,
        "mean": 841.4791666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.3968505134185154,
        "sum_squared": 5.744892383674601,
        "min": 2.3968505134185154,
        "max": 2.3968505134185154,
        "mean": 2.3968505134185154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.3968505134185154,
        "sum_squared": 5.744892383674601,
        "min": 2.3968505134185154,
        "max": 2.3968505134185154,
        "mean": 2.3968505134185154,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 155.52916666666667,
        "sum_squared": 24189.32168402778,
        "min": 155.52916666666667,
        "max": 155.52916666666667,
        "mean": 155.52916666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 841.4791666666666,
        "sum_squared": 708087.1879340278,
        "min": 841.4791666666666,
        "max": 841.4791666666666,
        "mean": 841.4791666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 841.4791666666666,
        "sum_squared": 708087.1879340278,
        "min": 841.4791666666666,
        "max": 841.4791666666666,
        "mean": 841.4791666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "call_center_summarization:model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "revision": "main"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 3.2153423209985097,
        "sum_squared": 10.338426241204083,
        "min": 3.2153423209985097,
        "max": 3.2153423209985097,
        "mean": 3.2153423209985097,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 964.7083333333334,
        "sum_squared": 930662.1684027779,
        "min": 964.7083333333334,
        "max": 964.7083333333334,
        "mean": 964.7083333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.2153423209985097,
        "sum_squared": 10.338426241204083,
        "min": 3.2153423209985097,
        "max": 3.2153423209985097,
        "mean": 3.2153423209985097,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.2153423209985097,
        "sum_squared": 10.338426241204083,
        "min": 3.2153423209985097,
        "max": 3.2153423209985097,
        "mean": 3.2153423209985097,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.33333333333334,
        "sum_squared": 31802.77777777778,
        "min": 178.33333333333334,
        "max": 178.33333333333334,
        "mean": 178.33333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 964.7083333333334,
        "sum_squared": 930662.1684027779,
        "min": 964.7083333333334,
        "max": 964.7083333333334,
        "mean": 964.7083333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 964.7083333333334,
        "sum_squared": 930662.1684027779,
        "min": 964.7083333333334,
        "max": 964.7083333333334,
        "mean": 964.7083333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:subset=real_call_transcripts,model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "call_center_summarization:subset=real_call_transcripts,model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "subset": "real_call_transcripts"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_real_call_transcripts"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1726.4,
        "sum_squared": 2980456.9600000004,
        "min": 1726.4,
        "max": 1726.4,
        "mean": 1726.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 4.378824758529663,
        "sum_squared": 19.17410626591236,
        "min": 4.378824758529663,
        "max": 4.378824758529663,
        "mean": 4.378824758529663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 846.8,
        "sum_squared": 717070.2399999999,
        "min": 846.8,
        "max": 846.8,
        "mean": 846.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1726.4,
        "sum_squared": 2980456.9600000004,
        "min": 1726.4,
        "max": 1726.4,
        "mean": 1726.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1726.4,
        "sum_squared": 2980456.9600000004,
        "min": 1726.4,
        "max": 1726.4,
        "mean": 1726.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.378824758529663,
        "sum_squared": 19.17410626591236,
        "min": 4.378824758529663,
        "max": 4.378824758529663,
        "mean": 4.378824758529663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 4.378824758529663,
        "sum_squared": 19.17410626591236,
        "min": 4.378824758529663,
        "max": 4.378824758529663,
        "mean": 4.378824758529663,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.0,
        "sum_squared": 29929.0,
        "min": 173.0,
        "max": 173.0,
        "mean": 173.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 846.8,
        "sum_squared": 717070.2399999999,
        "min": 846.8,
        "max": 846.8,
        "mean": 846.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 846.8,
        "sum_squared": 717070.2399999999,
        "min": 846.8,
        "max": 846.8,
        "mean": 846.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:subset=real_call_transcripts,model=meta_llama-3-70b-chat",
    "run_spec": {
      "name": "call_center_summarization:subset=real_call_transcripts,model=meta_llama-3-70b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "subset": "real_call_transcripts"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-70b-chat",
        "model": "meta/llama-3-70b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_real_call_transcripts"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1688.0,
        "sum_squared": 2849344.0,
        "min": 1688.0,
        "max": 1688.0,
        "mean": 1688.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.9109180927276612,
        "sum_squared": 3.6516079571139226,
        "min": 1.9109180927276612,
        "max": 1.9109180927276612,
        "mean": 1.9109180927276612,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -18.555278973548532,
        "sum_squared": 344.29837778621226,
        "min": -18.555278973548532,
        "max": -18.555278973548532,
        "mean": -18.555278973548532,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 850.4,
        "sum_squared": 723180.1599999999,
        "min": 850.4,
        "max": 850.4,
        "mean": 850.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1146165042281866,
        "sum_squared": 1.242369951497863,
        "min": 1.1146165042281866,
        "max": 1.1146165042281866,
        "mean": 1.1146165042281866,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.031478844023341626,
        "sum_squared": 0.0009909176210458708,
        "min": 0.031478844023341626,
        "max": 0.031478844023341626,
        "mean": 0.031478844023341626,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.021819471982065533,
        "sum_squared": 0.0004760893575761428,
        "min": -0.021819471982065533,
        "max": -0.021819471982065533,
        "mean": -0.021819471982065533,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1688.0,
        "sum_squared": 2849344.0,
        "min": 1688.0,
        "max": 1688.0,
        "mean": 1688.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1688.0,
        "sum_squared": 2849344.0,
        "min": 1688.0,
        "max": 1688.0,
        "mean": 1688.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9109180927276612,
        "sum_squared": 3.6516079571139226,
        "min": 1.9109180927276612,
        "max": 1.9109180927276612,
        "mean": 1.9109180927276612,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9109180927276612,
        "sum_squared": 3.6516079571139226,
        "min": 1.9109180927276612,
        "max": 1.9109180927276612,
        "mean": 1.9109180927276612,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -18.555278973548532,
        "sum_squared": 344.29837778621226,
        "min": -18.555278973548532,
        "max": -18.555278973548532,
        "mean": -18.555278973548532,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -18.555278973548532,
        "sum_squared": 344.29837778621226,
        "min": -18.555278973548532,
        "max": -18.555278973548532,
        "mean": -18.555278973548532,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 171.0,
        "sum_squared": 29241.0,
        "min": 171.0,
        "max": 171.0,
        "mean": 171.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 850.4,
        "sum_squared": 723180.1599999999,
        "min": 850.4,
        "max": 850.4,
        "mean": 850.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 850.4,
        "sum_squared": 723180.1599999999,
        "min": 850.4,
        "max": 850.4,
        "mean": 850.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:subset=real_call_transcripts,model=meta_llama-3-8b-chat",
    "run_spec": {
      "name": "call_center_summarization:subset=real_call_transcripts,model=meta_llama-3-8b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "subset": "real_call_transcripts"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-8b-chat",
        "model": "meta/llama-3-8b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_real_call_transcripts"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1688.0,
        "sum_squared": 2849344.0,
        "min": 1688.0,
        "max": 1688.0,
        "mean": 1688.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1614038467407226,
        "sum_squared": 1.3488588952241478,
        "min": 1.1614038467407226,
        "max": 1.1614038467407226,
        "mean": 1.1614038467407226,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -36.314123199015285,
        "sum_squared": 1318.71554371326,
        "min": -36.314123199015285,
        "max": -36.314123199015285,
        "mean": -36.314123199015285,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 935.6,
        "sum_squared": 875347.36,
        "min": 935.6,
        "max": 935.6,
        "mean": 935.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.216624935596565,
        "sum_squared": 1.480176233915346,
        "min": 1.216624935596565,
        "max": 1.216624935596565,
        "mean": 1.216624935596565,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.05599637179718919,
        "sum_squared": 0.003135593654449045,
        "min": 0.05599637179718919,
        "max": 0.05599637179718919,
        "mean": 0.05599637179718919,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.038813727232808126,
        "sum_squared": 0.001506505421702831,
        "min": -0.038813727232808126,
        "max": -0.038813727232808126,
        "mean": -0.038813727232808126,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1688.0,
        "sum_squared": 2849344.0,
        "min": 1688.0,
        "max": 1688.0,
        "mean": 1688.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1688.0,
        "sum_squared": 2849344.0,
        "min": 1688.0,
        "max": 1688.0,
        "mean": 1688.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1614038467407226,
        "sum_squared": 1.3488588952241478,
        "min": 1.1614038467407226,
        "max": 1.1614038467407226,
        "mean": 1.1614038467407226,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1614038467407226,
        "sum_squared": 1.3488588952241478,
        "min": 1.1614038467407226,
        "max": 1.1614038467407226,
        "mean": 1.1614038467407226,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -36.314123199015285,
        "sum_squared": 1318.71554371326,
        "min": -36.314123199015285,
        "max": -36.314123199015285,
        "mean": -36.314123199015285,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -36.314123199015285,
        "sum_squared": 1318.71554371326,
        "min": -36.314123199015285,
        "max": -36.314123199015285,
        "mean": -36.314123199015285,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 185.2,
        "sum_squared": 34299.03999999999,
        "min": 185.2,
        "max": 185.2,
        "mean": 185.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 935.6,
        "sum_squared": 875347.36,
        "min": 935.6,
        "max": 935.6,
        "mean": 935.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 935.6,
        "sum_squared": 875347.36,
        "min": 935.6,
        "max": 935.6,
        "mean": 935.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:subset=real_call_transcripts,model=openai_gpt-4o-2024-05-13",
    "run_spec": {
      "name": "call_center_summarization:subset=real_call_transcripts,model=openai_gpt-4o-2024-05-13",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "subset": "real_call_transcripts"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-2024-05-13",
        "model": "openai/gpt-4o-2024-05-13",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_real_call_transcripts"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1642.6,
        "sum_squared": 2698134.76,
        "min": 1642.6,
        "max": 1642.6,
        "mean": 1642.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.9384600639343261,
        "sum_squared": 3.757627419468272,
        "min": 1.9384600639343261,
        "max": 1.9384600639343261,
        "mean": 1.9384600639343261,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 754.6,
        "sum_squared": 569421.16,
        "min": 754.6,
        "max": 754.6,
        "mean": 754.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1642.6,
        "sum_squared": 2698134.76,
        "min": 1642.6,
        "max": 1642.6,
        "mean": 1642.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1642.6,
        "sum_squared": 2698134.76,
        "min": 1642.6,
        "max": 1642.6,
        "mean": 1642.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9384600639343261,
        "sum_squared": 3.757627419468272,
        "min": 1.9384600639343261,
        "max": 1.9384600639343261,
        "mean": 1.9384600639343261,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9384600639343261,
        "sum_squared": 3.757627419468272,
        "min": 1.9384600639343261,
        "max": 1.9384600639343261,
        "mean": 1.9384600639343261,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 142.8,
        "sum_squared": 20391.840000000004,
        "min": 142.8,
        "max": 142.8,
        "mean": 142.8,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 754.6,
        "sum_squared": 569421.16,
        "min": 754.6,
        "max": 754.6,
        "mean": 754.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 754.6,
        "sum_squared": 569421.16,
        "min": 754.6,
        "max": 754.6,
        "mean": 754.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 0.95,
        "sum_squared": 0.9025,
        "min": 0.95,
        "max": 0.95,
        "mean": 0.95,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.95,
        "sum_squared": 0.9025,
        "min": 0.95,
        "max": 0.95,
        "mean": 0.95,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.95,
        "sum_squared": 0.9025,
        "min": 0.95,
        "max": 0.95,
        "mean": 0.95,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization:subset=real_call_transcripts,model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "call_center_summarization:subset=real_call_transcripts,model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
        "args": {
          "subset": "real_call_transcripts"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "faithfulness",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "relevance",
            "min_score": 1,
            "max_score": 5
          }
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
          "args": {
            "annotator_name": "call_center_summarization",
            "key": "coherence",
            "min_score": 1,
            "max_score": 5
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_real_call_transcripts"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1642.6,
        "sum_squared": 2698134.76,
        "min": 1642.6,
        "max": 1642.6,
        "mean": 1642.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.585325527191162,
        "sum_squared": 6.683908081546259,
        "min": 2.585325527191162,
        "max": 2.585325527191162,
        "mean": 2.585325527191162,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 875.4,
        "sum_squared": 766325.1599999999,
        "min": 875.4,
        "max": 875.4,
        "mean": 875.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1642.6,
        "sum_squared": 2698134.76,
        "min": 1642.6,
        "max": 1642.6,
        "mean": 1642.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1642.6,
        "sum_squared": 2698134.76,
        "min": 1642.6,
        "max": 1642.6,
        "mean": 1642.6,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.585325527191162,
        "sum_squared": 6.683908081546259,
        "min": 2.585325527191162,
        "max": 2.585325527191162,
        "mean": 2.585325527191162,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.585325527191162,
        "sum_squared": 6.683908081546259,
        "min": 2.585325527191162,
        "max": 2.585325527191162,
        "mean": 2.585325527191162,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 164.0,
        "sum_squared": 26896.0,
        "min": 164.0,
        "max": 164.0,
        "mean": 164.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 875.4,
        "sum_squared": 766325.1599999999,
        "min": 875.4,
        "max": 875.4,
        "mean": 875.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 875.4,
        "sum_squared": 766325.1599999999,
        "min": 875.4,
        "max": 875.4,
        "mean": 875.4,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 5.0,
        "sum_squared": 25.0,
        "min": 5.0,
        "max": 5.0,
        "mean": 5.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_relevance",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_coherence",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_key_points_recall:model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "call_center_summarization_key_points_recall:model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_key_points_recall",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_key_points_recall"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 3.434149192770322,
        "sum_squared": 11.793380678205054,
        "min": 3.434149192770322,
        "max": 3.434149192770322,
        "mean": 3.434149192770322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 790.0666666666667,
        "sum_squared": 624205.3377777778,
        "min": 790.0666666666667,
        "max": 790.0666666666667,
        "mean": 790.0666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.434149192770322,
        "sum_squared": 11.793380678205054,
        "min": 3.434149192770322,
        "max": 3.434149192770322,
        "mean": 3.434149192770322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.434149192770322,
        "sum_squared": 11.793380678205054,
        "min": 3.434149192770322,
        "max": 3.434149192770322,
        "mean": 3.434149192770322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 790.0666666666667,
        "sum_squared": 624205.3377777778,
        "min": 790.0666666666667,
        "max": 790.0666666666667,
        "mean": 790.0666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 790.0666666666667,
        "sum_squared": 624205.3377777778,
        "min": 790.0666666666667,
        "max": 790.0666666666667,
        "mean": 790.0666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8287500000000013,
        "sum_squared": 0.6868265625000022,
        "min": 0.8287500000000013,
        "max": 0.8287500000000013,
        "mean": 0.8287500000000013,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8287500000000013,
        "sum_squared": 0.6868265625000022,
        "min": 0.8287500000000013,
        "max": 0.8287500000000013,
        "mean": 0.8287500000000013,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8287500000000013,
        "sum_squared": 0.6868265625000022,
        "min": 0.8287500000000013,
        "max": 0.8287500000000013,
        "mean": 0.8287500000000013,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_key_points_recall:model=meta_llama-3-70b-chat",
    "run_spec": {
      "name": "call_center_summarization_key_points_recall:model=meta_llama-3-70b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-70b-chat",
        "model": "meta/llama-3-70b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_key_points_recall",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_key_points_recall"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.58345062037309,
        "sum_squared": 2.5073158671599236,
        "min": 1.58345062037309,
        "max": 1.58345062037309,
        "mean": 1.58345062037309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -15.858516478547493,
        "sum_squared": 251.4925449003624,
        "min": -15.858516478547493,
        "max": -15.858516478547493,
        "mean": -15.858516478547493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 880.7625,
        "sum_squared": 775742.5814062501,
        "min": 880.7625,
        "max": 880.7625,
        "mean": 880.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0979484800586545,
        "sum_squared": 1.2054908648631097,
        "min": 1.0979484800586545,
        "max": 1.0979484800586545,
        "mean": 1.0979484800586545,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.025976359210861468,
        "sum_squared": 0.0006747712378517074,
        "min": 0.025976359210861468,
        "max": 0.025976359210861468,
        "mean": 0.025976359210861468,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.018005440148220992,
        "sum_squared": 0.00032419587493116836,
        "min": -0.018005440148220992,
        "max": -0.018005440148220992,
        "mean": -0.018005440148220992,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.58345062037309,
        "sum_squared": 2.5073158671599236,
        "min": 1.58345062037309,
        "max": 1.58345062037309,
        "mean": 1.58345062037309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.58345062037309,
        "sum_squared": 2.5073158671599236,
        "min": 1.58345062037309,
        "max": 1.58345062037309,
        "mean": 1.58345062037309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -15.858516478547493,
        "sum_squared": 251.4925449003624,
        "min": -15.858516478547493,
        "max": -15.858516478547493,
        "mean": -15.858516478547493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -15.858516478547493,
        "sum_squared": 251.4925449003624,
        "min": -15.858516478547493,
        "max": -15.858516478547493,
        "mean": -15.858516478547493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 880.7625,
        "sum_squared": 775742.5814062501,
        "min": 880.7625,
        "max": 880.7625,
        "mean": 880.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 880.7625,
        "sum_squared": 775742.5814062501,
        "min": 880.7625,
        "max": 880.7625,
        "mean": 880.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8087500000000011,
        "sum_squared": 0.6540765625000018,
        "min": 0.8087500000000011,
        "max": 0.8087500000000011,
        "mean": 0.8087500000000011,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8087500000000011,
        "sum_squared": 0.6540765625000018,
        "min": 0.8087500000000011,
        "max": 0.8087500000000011,
        "mean": 0.8087500000000011,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8087500000000011,
        "sum_squared": 0.6540765625000018,
        "min": 0.8087500000000011,
        "max": 0.8087500000000011,
        "mean": 0.8087500000000011,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_key_points_recall:model=meta_llama-3-8b-chat",
    "run_spec": {
      "name": "call_center_summarization_key_points_recall:model=meta_llama-3-8b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-8b-chat",
        "model": "meta/llama-3-8b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_key_points_recall",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_key_points_recall"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9591338644425075,
        "sum_squared": 0.9199377699204183,
        "min": 0.9591338644425075,
        "max": 0.9591338644425075,
        "mean": 0.9591338644425075,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -29.165907789174906,
        "sum_squared": 850.6501771666534,
        "min": -29.165907789174906,
        "max": -29.165907789174906,
        "mean": -29.165907789174906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 899.4708333333333,
        "sum_squared": 809047.780017361,
        "min": 899.4708333333333,
        "max": 899.4708333333333,
        "mean": 899.4708333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1823545669237943,
        "sum_squared": 1.397962321925553,
        "min": 1.1823545669237943,
        "max": 1.1823545669237943,
        "mean": 1.1823545669237943,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.04678029455889427,
        "sum_squared": 0.002188395959016913,
        "min": 0.04678029455889427,
        "max": 0.04678029455889427,
        "mean": 0.04678029455889427,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.03242562927926131,
        "sum_squared": 0.0010514214341560886,
        "min": -0.03242562927926131,
        "max": -0.03242562927926131,
        "mean": -0.03242562927926131,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9591338644425075,
        "sum_squared": 0.9199377699204183,
        "min": 0.9591338644425075,
        "max": 0.9591338644425075,
        "mean": 0.9591338644425075,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9591338644425075,
        "sum_squared": 0.9199377699204183,
        "min": 0.9591338644425075,
        "max": 0.9591338644425075,
        "mean": 0.9591338644425075,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.165907789174906,
        "sum_squared": 850.6501771666534,
        "min": -29.165907789174906,
        "max": -29.165907789174906,
        "mean": -29.165907789174906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.165907789174906,
        "sum_squared": 850.6501771666534,
        "min": -29.165907789174906,
        "max": -29.165907789174906,
        "mean": -29.165907789174906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 899.4708333333333,
        "sum_squared": 809047.780017361,
        "min": 899.4708333333333,
        "max": 899.4708333333333,
        "mean": 899.4708333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 899.4708333333333,
        "sum_squared": 809047.780017361,
        "min": 899.4708333333333,
        "max": 899.4708333333333,
        "mean": 899.4708333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.784583333333334,
        "sum_squared": 0.6155710069444454,
        "min": 0.784583333333334,
        "max": 0.784583333333334,
        "mean": 0.784583333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.784583333333334,
        "sum_squared": 0.6155710069444454,
        "min": 0.784583333333334,
        "max": 0.784583333333334,
        "mean": 0.784583333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.784583333333334,
        "sum_squared": 0.6155710069444454,
        "min": 0.784583333333334,
        "max": 0.784583333333334,
        "mean": 0.784583333333334,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_key_points_recall:model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "call_center_summarization_key_points_recall:model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_key_points_recall",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_key_points_recall"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0653718024492265,
        "sum_squared": 4.265760682352367,
        "min": 2.0653718024492265,
        "max": 2.0653718024492265,
        "mean": 2.0653718024492265,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 969.5208333333334,
        "sum_squared": 939970.6462673612,
        "min": 969.5208333333334,
        "max": 969.5208333333334,
        "mean": 969.5208333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 10.0,
        "sum_squared": 100.0,
        "min": 10.0,
        "max": 10.0,
        "mean": 10.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0653718024492265,
        "sum_squared": 4.265760682352367,
        "min": 2.0653718024492265,
        "max": 2.0653718024492265,
        "mean": 2.0653718024492265,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0653718024492265,
        "sum_squared": 4.265760682352367,
        "min": 2.0653718024492265,
        "max": 2.0653718024492265,
        "mean": 2.0653718024492265,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 969.5208333333334,
        "sum_squared": 939970.6462673612,
        "min": 969.5208333333334,
        "max": 969.5208333333334,
        "mean": 969.5208333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 969.5208333333334,
        "sum_squared": 939970.6462673612,
        "min": 969.5208333333334,
        "max": 969.5208333333334,
        "mean": 969.5208333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9044907407407423,
        "sum_squared": 0.8181035000857367,
        "min": 0.9044907407407423,
        "max": 0.9044907407407423,
        "mean": 0.9044907407407423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9044907407407423,
        "sum_squared": 0.8181035000857367,
        "min": 0.9044907407407423,
        "max": 0.9044907407407423,
        "mean": 0.9044907407407423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9044907407407423,
        "sum_squared": 0.8181035000857367,
        "min": 0.9044907407407423,
        "max": 0.9044907407407423,
        "mean": 0.9044907407407423,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_pairwise_comparison:model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "call_center_summarization_pairwise_comparison:model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_pairwise_comparison",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_pairwise_comparison"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 3.434149192770322,
        "sum_squared": 11.793380678205054,
        "min": 3.434149192770322,
        "max": 3.434149192770322,
        "mean": 3.434149192770322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 790.0666666666667,
        "sum_squared": 624205.3377777778,
        "min": 790.0666666666667,
        "max": 790.0666666666667,
        "mean": 790.0666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 801.1083333333333,
        "sum_squared": 641774.5617361112,
        "min": 801.1083333333333,
        "max": 801.1083333333333,
        "mean": 801.1083333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.434149192770322,
        "sum_squared": 11.793380678205054,
        "min": 3.434149192770322,
        "max": 3.434149192770322,
        "mean": 3.434149192770322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.434149192770322,
        "sum_squared": 11.793380678205054,
        "min": 3.434149192770322,
        "max": 3.434149192770322,
        "mean": 3.434149192770322,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 151.10833333333332,
        "sum_squared": 22833.728402777775,
        "min": 151.10833333333332,
        "max": 151.10833333333332,
        "mean": 151.10833333333332,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 790.0666666666667,
        "sum_squared": 624205.3377777778,
        "min": 790.0666666666667,
        "max": 790.0666666666667,
        "mean": 790.0666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 790.0666666666667,
        "sum_squared": 624205.3377777778,
        "min": 790.0666666666667,
        "max": 790.0666666666667,
        "mean": 790.0666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8958333333333334,
        "sum_squared": 0.8025173611111112,
        "min": 0.8958333333333334,
        "max": 0.8958333333333334,
        "mean": 0.8958333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8958333333333334,
        "sum_squared": 0.8025173611111112,
        "min": 0.8958333333333334,
        "max": 0.8958333333333334,
        "mean": 0.8958333333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8958333333333334,
        "sum_squared": 0.8025173611111112,
        "min": 0.8958333333333334,
        "max": 0.8958333333333334,
        "mean": 0.8958333333333334,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_pairwise_comparison:model=meta_llama-3-70b-chat",
    "run_spec": {
      "name": "call_center_summarization_pairwise_comparison:model=meta_llama-3-70b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-70b-chat",
        "model": "meta/llama-3-70b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_pairwise_comparison",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_pairwise_comparison"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.58345062037309,
        "sum_squared": 2.5073158671599236,
        "min": 1.58345062037309,
        "max": 1.58345062037309,
        "mean": 1.58345062037309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -15.858516478547493,
        "sum_squared": 251.4925449003624,
        "min": -15.858516478547493,
        "max": -15.858516478547493,
        "mean": -15.858516478547493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 880.7625,
        "sum_squared": 775742.5814062501,
        "min": 880.7625,
        "max": 880.7625,
        "mean": 880.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0979484800586545,
        "sum_squared": 1.2054908648631097,
        "min": 1.0979484800586545,
        "max": 1.0979484800586545,
        "mean": 1.0979484800586545,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.025976359210861468,
        "sum_squared": 0.0006747712378517074,
        "min": 0.025976359210861468,
        "max": 0.025976359210861468,
        "mean": 0.025976359210861468,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.018005440148220992,
        "sum_squared": 0.00032419587493116836,
        "min": -0.018005440148220992,
        "max": -0.018005440148220992,
        "mean": -0.018005440148220992,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.58345062037309,
        "sum_squared": 2.5073158671599236,
        "min": 1.58345062037309,
        "max": 1.58345062037309,
        "mean": 1.58345062037309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.58345062037309,
        "sum_squared": 2.5073158671599236,
        "min": 1.58345062037309,
        "max": 1.58345062037309,
        "mean": 1.58345062037309,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -15.858516478547493,
        "sum_squared": 251.4925449003624,
        "min": -15.858516478547493,
        "max": -15.858516478547493,
        "mean": -15.858516478547493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -15.858516478547493,
        "sum_squared": 251.4925449003624,
        "min": -15.858516478547493,
        "max": -15.858516478547493,
        "mean": -15.858516478547493,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 169.7125,
        "sum_squared": 28802.33265625,
        "min": 169.7125,
        "max": 169.7125,
        "mean": 169.7125,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 880.7625,
        "sum_squared": 775742.5814062501,
        "min": 880.7625,
        "max": 880.7625,
        "mean": 880.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 880.7625,
        "sum_squared": 775742.5814062501,
        "min": 880.7625,
        "max": 880.7625,
        "mean": 880.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9333333333333333,
        "sum_squared": 0.8711111111111112,
        "min": 0.9333333333333333,
        "max": 0.9333333333333333,
        "mean": 0.9333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9333333333333333,
        "sum_squared": 0.8711111111111112,
        "min": 0.9333333333333333,
        "max": 0.9333333333333333,
        "mean": 0.9333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9333333333333333,
        "sum_squared": 0.8711111111111112,
        "min": 0.9333333333333333,
        "max": 0.9333333333333333,
        "mean": 0.9333333333333333,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_pairwise_comparison:model=meta_llama-3-8b-chat",
    "run_spec": {
      "name": "call_center_summarization_pairwise_comparison:model=meta_llama-3-8b-chat",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3-8b-chat",
        "model": "meta/llama-3-8b-chat",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_pairwise_comparison",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_pairwise_comparison"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9591338644425075,
        "sum_squared": 0.9199377699204183,
        "min": 0.9591338644425075,
        "max": 0.9591338644425075,
        "mean": 0.9591338644425075,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -29.165907789174906,
        "sum_squared": 850.6501771666534,
        "min": -29.165907789174906,
        "max": -29.165907789174906,
        "mean": -29.165907789174906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 899.4708333333333,
        "sum_squared": 809047.780017361,
        "min": 899.4708333333333,
        "max": 899.4708333333333,
        "mean": 899.4708333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1823545669237943,
        "sum_squared": 1.397962321925553,
        "min": 1.1823545669237943,
        "max": 1.1823545669237943,
        "mean": 1.1823545669237943,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.04678029455889427,
        "sum_squared": 0.002188395959016913,
        "min": 0.04678029455889427,
        "max": 0.04678029455889427,
        "mean": 0.04678029455889427,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.03242562927926131,
        "sum_squared": 0.0010514214341560886,
        "min": -0.03242562927926131,
        "max": -0.03242562927926131,
        "mean": -0.03242562927926131,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 731.5833333333334,
        "sum_squared": 535214.1736111111,
        "min": 731.5833333333334,
        "max": 731.5833333333334,
        "mean": 731.5833333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9591338644425075,
        "sum_squared": 0.9199377699204183,
        "min": 0.9591338644425075,
        "max": 0.9591338644425075,
        "mean": 0.9591338644425075,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9591338644425075,
        "sum_squared": 0.9199377699204183,
        "min": 0.9591338644425075,
        "max": 0.9591338644425075,
        "mean": 0.9591338644425075,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.165907789174906,
        "sum_squared": 850.6501771666534,
        "min": -29.165907789174906,
        "max": -29.165907789174906,
        "mean": -29.165907789174906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -29.165907789174906,
        "sum_squared": 850.6501771666534,
        "min": -29.165907789174906,
        "max": -29.165907789174906,
        "mean": -29.165907789174906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 174.11666666666667,
        "sum_squared": 30316.613611111115,
        "min": 174.11666666666667,
        "max": 174.11666666666667,
        "mean": 174.11666666666667,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 899.4708333333333,
        "sum_squared": 809047.780017361,
        "min": 899.4708333333333,
        "max": 899.4708333333333,
        "mean": 899.4708333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 899.4708333333333,
        "sum_squared": 809047.780017361,
        "min": 899.4708333333333,
        "max": 899.4708333333333,
        "mean": 899.4708333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.9708333333333333,
        "sum_squared": 0.9425173611111111,
        "min": 0.9708333333333333,
        "max": 0.9708333333333333,
        "mean": 0.9708333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9708333333333333,
        "sum_squared": 0.9425173611111111,
        "min": 0.9708333333333333,
        "max": 0.9708333333333333,
        "mean": 0.9708333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.9708333333333333,
        "sum_squared": 0.9425173611111111,
        "min": 0.9708333333333333,
        "max": 0.9708333333333333,
        "mean": 0.9708333333333333,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_pairwise_comparison:model=openai_gpt-4o-2024-05-13",
    "run_spec": {
      "name": "call_center_summarization_pairwise_comparison:model=openai_gpt-4o-2024-05-13",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-2024-05-13",
        "model": "openai/gpt-4o-2024-05-13",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_pairwise_comparison",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_pairwise_comparison"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.2198004086812335,
        "sum_squared": 4.927513854381371,
        "min": 2.2198004086812335,
        "max": 2.2198004086812335,
        "mean": 2.2198004086812335,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 836.2,
        "sum_squared": 699230.4400000001,
        "min": 836.2,
        "max": 836.2,
        "mean": 836.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.2198004086812335,
        "sum_squared": 4.927513854381371,
        "min": 2.2198004086812335,
        "max": 2.2198004086812335,
        "mean": 2.2198004086812335,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.2198004086812335,
        "sum_squared": 4.927513854381371,
        "min": 2.2198004086812335,
        "max": 2.2198004086812335,
        "mean": 2.2198004086812335,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 154.72916666666666,
        "sum_squared": 23941.11501736111,
        "min": 154.72916666666666,
        "max": 154.72916666666666,
        "mean": 154.72916666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 836.2,
        "sum_squared": 699230.4400000001,
        "min": 836.2,
        "max": 836.2,
        "mean": 836.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 836.2,
        "sum_squared": 699230.4400000001,
        "min": 836.2,
        "max": 836.2,
        "mean": 836.2,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.6375,
        "sum_squared": 0.40640624999999997,
        "min": 0.6375,
        "max": 0.6375,
        "mean": 0.6375,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6375,
        "sum_squared": 0.40640624999999997,
        "min": 0.6375,
        "max": 0.6375,
        "mean": 0.6375,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.6375,
        "sum_squared": 0.40640624999999997,
        "min": 0.6375,
        "max": 0.6375,
        "mean": 0.6375,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.1.0/call_center_summarization_pairwise_comparison:model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "call_center_summarization_pairwise_comparison:model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
        "args": {}
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Summarize the call transcript in under 10 sentences.",
        "input_prefix": "### Call Transcript\n",
        "input_suffix": "",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "output_prefix": "",
        "output_suffix": "",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 0,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
          "args": {
            "annotator_name": "call_center_summarization_pairwise_comparison",
            "key": "score"
          }
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "call_center_summarization_pairwise_comparison"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.0653718024492265,
        "sum_squared": 4.265760682352367,
        "min": 2.0653718024492265,
        "max": 2.0653718024492265,
        "mean": 2.0653718024492265,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 969.5208333333334,
        "sum_squared": 939970.6462673612,
        "min": 969.5208333333334,
        "max": 969.5208333333334,
        "mean": 969.5208333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 728.7625,
        "sum_squared": 531094.7814062501,
        "min": 728.7625,
        "max": 728.7625,
        "mean": 728.7625,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0653718024492265,
        "sum_squared": 4.265760682352367,
        "min": 2.0653718024492265,
        "max": 2.0653718024492265,
        "mean": 2.0653718024492265,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.0653718024492265,
        "sum_squared": 4.265760682352367,
        "min": 2.0653718024492265,
        "max": 2.0653718024492265,
        "mean": 2.0653718024492265,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.03333333333333,
        "sum_squared": 32052.934444444443,
        "min": 179.03333333333333,
        "max": 179.03333333333333,
        "mean": 179.03333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 969.5208333333334,
        "sum_squared": 939970.6462673612,
        "min": 969.5208333333334,
        "max": 969.5208333333334,
        "mean": 969.5208333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 969.5208333333334,
        "sum_squared": 939970.6462673612,
        "min": 969.5208333333334,
        "max": 969.5208333333334,
        "mean": 969.5208333333334,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 240.0,
        "sum_squared": 57600.0,
        "min": 240.0,
        "max": 240.0,
        "mean": 240.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test"
        },
        "count": 1,
        "sum": 0.4083333333333333,
        "sum_squared": 0.1667361111111111,
        "min": 0.4083333333333333,
        "max": 0.4083333333333333,
        "mean": 0.4083333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4083333333333333,
        "sum_squared": 0.1667361111111111,
        "min": 0.4083333333333333,
        "max": 0.4083333333333333,
        "mean": 0.4083333333333333,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.4083333333333333,
        "sum_squared": 0.1667361111111111,
        "min": 0.4083333333333333,
        "max": 0.4083333333333333,
        "mean": 0.4083333333333333,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  }
]