[
  {
    "name": "call_center_summarization:model=anthropic_claude-3-5-sonnet-20240620",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "revision": "main"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
      "model": "anthropic/claude-3-5-sonnet-20240620",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:model=meta_llama-3-70b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "revision": "main"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-70b-chat",
      "model": "meta/llama-3-70b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:model=meta_llama-3-8b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "revision": "main"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-8b-chat",
      "model": "meta/llama-3-8b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:model=openai_gpt-4o-2024-05-13",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "revision": "main"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-2024-05-13",
      "model": "openai/gpt-4o-2024-05-13",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:model=openai_gpt-4o-mini-2024-07-18",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "revision": "main"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-mini-2024-07-18",
      "model": "openai/gpt-4o-mini-2024-07-18",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:subset=real_call_transcripts,model=anthropic_claude-3-5-sonnet-20240620",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "subset": "real_call_transcripts"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
      "model": "anthropic/claude-3-5-sonnet-20240620",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_real_call_transcripts"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:subset=real_call_transcripts,model=meta_llama-3-70b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "subset": "real_call_transcripts"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-70b-chat",
      "model": "meta/llama-3-70b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_real_call_transcripts"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:subset=real_call_transcripts,model=meta_llama-3-8b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "subset": "real_call_transcripts"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-8b-chat",
      "model": "meta/llama-3-8b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_real_call_transcripts"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:subset=real_call_transcripts,model=openai_gpt-4o-2024-05-13",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "subset": "real_call_transcripts"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-2024-05-13",
      "model": "openai/gpt-4o-2024-05-13",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_real_call_transcripts"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization:subset=real_call_transcripts,model=openai_gpt-4o-mini-2024-07-18",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationScenario",
      "args": {
        "subset": "real_call_transcripts"
      }
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-mini-2024-07-18",
      "model": "openai/gpt-4o-mini-2024-07-18",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "faithfulness",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "relevance",
          "min_score": 1,
          "max_score": 5
        }
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationLikertScaleMetric",
        "args": {
          "annotator_name": "call_center_summarization",
          "key": "coherence",
          "min_score": 1,
          "max_score": 5
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_real_call_transcripts"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_key_points_recall:model=anthropic_claude-3-5-sonnet-20240620",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
      "model": "anthropic/claude-3-5-sonnet-20240620",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_key_points_recall",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_key_points_recall"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_key_points_recall:model=meta_llama-3-70b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-70b-chat",
      "model": "meta/llama-3-70b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_key_points_recall",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_key_points_recall"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_key_points_recall:model=meta_llama-3-8b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-8b-chat",
      "model": "meta/llama-3-8b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_key_points_recall",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_key_points_recall"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_key_points_recall:model=openai_gpt-4o-mini-2024-07-18",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationKeyPointsRecallScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-mini-2024-07-18",
      "model": "openai/gpt-4o-mini-2024-07-18",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_key_points_recall",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_key_points_recall"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationKeyPointsRecallAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_pairwise_comparison:model=anthropic_claude-3-5-sonnet-20240620",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
      "model": "anthropic/claude-3-5-sonnet-20240620",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_pairwise_comparison",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_pairwise_comparison"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_pairwise_comparison:model=meta_llama-3-70b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-70b-chat",
      "model": "meta/llama-3-70b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_pairwise_comparison",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_pairwise_comparison"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_pairwise_comparison:model=meta_llama-3-8b-chat",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "together/llama-3-8b-chat",
      "model": "meta/llama-3-8b-chat",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_pairwise_comparison",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_pairwise_comparison"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_pairwise_comparison:model=openai_gpt-4o-2024-05-13",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-2024-05-13",
      "model": "openai/gpt-4o-2024-05-13",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_pairwise_comparison",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_pairwise_comparison"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
        "args": {}
      }
    ]
  },
  {
    "name": "call_center_summarization_pairwise_comparison:model=openai_gpt-4o-mini-2024-07-18",
    "scenario_spec": {
      "class_name": "helm.benchmark.scenarios.call_center_scenario.CallCenterSummarizationPairwiseComparisonScenario",
      "args": {}
    },
    "adapter_spec": {
      "method": "generation",
      "global_prefix": "",
      "global_suffix": "",
      "instructions": "Summarize the call transcript in under 10 sentences.",
      "input_prefix": "### Call Transcript\n",
      "input_suffix": "",
      "reference_prefix": "A. ",
      "reference_suffix": "\n",
      "output_prefix": "",
      "output_suffix": "",
      "instance_prefix": "\n",
      "substitutions": [],
      "max_train_instances": 0,
      "max_eval_instances": 1000,
      "num_outputs": 1,
      "num_train_trials": 1,
      "num_trials": 1,
      "sample_train": true,
      "model_deployment": "openai/gpt-4o-mini-2024-07-18",
      "model": "openai/gpt-4o-mini-2024-07-18",
      "temperature": 0.0,
      "max_tokens": 512,
      "stop_sequences": [],
      "multi_label": false
    },
    "metric_specs": [
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
        "args": {
          "names": []
        }
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
        "args": {}
      },
      {
        "class_name": "helm.benchmark.metrics.annotation_metrics.AnnotationNumericMetric",
        "args": {
          "annotator_name": "call_center_summarization_pairwise_comparison",
          "key": "score"
        }
      }
    ],
    "data_augmenter_spec": {
      "perturbation_specs": [],
      "should_augment_train_instances": false,
      "should_include_original_train": false,
      "should_skip_unchanged_train": false,
      "should_augment_eval_instances": false,
      "should_include_original_eval": false,
      "should_skip_unchanged_eval": false,
      "seeds_per_instance": 1
    },
    "groups": [
      "call_center_summarization_pairwise_comparison"
    ],
    "annotators": [
      {
        "class_name": "helm.benchmark.annotation.call_center_annotator.CallCenterSummarizationPairwiseComparisonAnnotator",
        "args": {}
      }
    ]
  }
]