{
  "metrics": [
    {
      "name": "num_perplexity_tokens",
      "display_name": "# tokens",
      "description": "Average number of tokens in the predicted output (for language modeling, the input too)."
    },
    {
      "name": "num_bytes",
      "display_name": "# bytes",
      "description": "Average number of bytes in the predicted output (for language modeling, the input too)."
    },
    {
      "name": "num_references",
      "display_name": "# ref",
      "description": "Number of references."
    },
    {
      "name": "num_train_trials",
      "display_name": "# trials",
      "description": "Number of trials, where in each trial we choose an independent, random set of training instances."
    },
    {
      "name": "estimated_num_tokens_cost",
      "display_name": "cost",
      "description": "An estimate of the number of tokens (including prompt and output completions) needed to perform the request."
    },
    {
      "name": "num_prompt_tokens",
      "display_name": "# prompt tokens",
      "description": "Number of tokens in the prompt."
    },
    {
      "name": "num_prompt_characters",
      "display_name": "# prompt chars",
      "description": "Number of characters in the prompt."
    },
    {
      "name": "num_completion_tokens",
      "display_name": "# completion tokens",
      "description": "Actual number of completion tokens (over all completions)."
    },
    {
      "name": "num_output_tokens",
      "display_name": "# output tokens",
      "description": "Actual number of output tokens."
    },
    {
      "name": "max_num_output_tokens",
      "display_name": "Max output tokens",
      "description": "Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences)."
    },
    {
      "name": "num_requests",
      "display_name": "# requests",
      "description": "Number of distinct API requests."
    },
    {
      "name": "num_instances",
      "display_name": "# eval",
      "description": "Number of evaluation instances."
    },
    {
      "name": "num_train_instances",
      "display_name": "# train",
      "description": "Number of training instances (e.g., in-context examples)."
    },
    {
      "name": "prompt_truncated",
      "display_name": "truncated",
      "description": "Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples)."
    },
    {
      "name": "finish_reason_length",
      "display_name": "finish b/c length",
      "description": "Fraction of instances where the output was terminated because of the max tokens limit."
    },
    {
      "name": "finish_reason_stop",
      "display_name": "finish b/c stop",
      "description": "Fraction of instances where the output was terminated because of the stop sequences."
    },
    {
      "name": "finish_reason_endoftext",
      "display_name": "finish b/c endoftext",
      "description": "Fraction of instances where the output was terminated because the end of text token was generated."
    },
    {
      "name": "finish_reason_unknown",
      "display_name": "finish b/c unknown",
      "description": "Fraction of instances where the output was terminated for unknown reasons."
    },
    {
      "name": "num_completions",
      "display_name": "# completions",
      "description": "Number of completions."
    },
    {
      "name": "predicted_index",
      "display_name": "Predicted index",
      "description": "Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice)."
    },
    {
      "name": "exact_match",
      "display_name": "Exact match",
      "short_display_name": "EM",
      "description": "Fraction of instances that the predicted output matches a correct reference exactly.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_faithfulness",
      "display_name": "Faithfulness",
      "short_display_name": "Faithfulness",
      "description": "Whether all the information expressed by the summary can be inferred from the source transcript.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_relevance",
      "display_name": "Relevance",
      "short_display_name": "Relevance",
      "description": "Whether the summary includes only important information from the source.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_coherence",
      "display_name": "Coherence",
      "short_display_name": "Coherence",
      "description": "Whether the summary organizes the relevant information into a well-structured summary.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_pairwise_comparison_score",
      "display_name": "Pairwise",
      "short_display_name": "Pairwise",
      "description": "Whether the model's summary was preferred by the evaluator model.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_key_points_recall_score",
      "display_name": "Recall",
      "short_display_name": "Recall",
      "description": "Fraction of key items from the reference that were recalled in the summary.",
      "lower_is_better": false
    }
  ],
  "perturbations": [],
  "metric_groups": [
    {
      "name": "summarization_metrics",
      "display_name": "Summarization",
      "metrics": [
        {
          "name": "annotation_call_center_summarization_faithfulness",
          "split": "${main_split}"
        },
        {
          "name": "annotation_call_center_summarization_relevance",
          "split": "${main_split}"
        },
        {
          "name": "annotation_call_center_summarization_coherence",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "pairwise_comparison_metrics",
      "display_name": "Pairwise Comparison",
      "metrics": [
        {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "key_points_recall_metrics",
      "display_name": "Key Points Recall",
      "metrics": [
        {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "efficiency",
      "display_name": "Efficiency",
      "metrics": [
        {
          "name": "inference_runtime",
          "split": "${main_split}"
        }
      ]
    },
    {
      "name": "general_information",
      "display_name": "General information",
      "metrics": [
        {
          "name": "num_instances",
          "split": "${main_split}"
        },
        {
          "name": "num_train_instances",
          "split": "${main_split}"
        },
        {
          "name": "prompt_truncated",
          "split": "${main_split}"
        },
        {
          "name": "num_prompt_tokens",
          "split": "${main_split}"
        },
        {
          "name": "num_output_tokens",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    }
  ],
  "run_groups": [
    {
      "name": "call_center_scenarios",
      "display_name": "Call Center Scenarios",
      "description": "Scenarios representing realistic tasks from the call center.",
      "metric_groups": [],
      "subgroups": [
        "call_center_summarization",
        "call_center_summarization_real_call_transcripts",
        "call_center_summarization_pairwise_comparison",
        "call_center_summarization_key_points_recall"
      ],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {},
      "category": "All scenarios",
      "visibility": "all_groups",
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "call_center_summarization",
      "display_name": "Summarization",
      "description": "Summarization of call center transcripts.",
      "metric_groups": [
        "summarization_metrics",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_split": "test"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "summarization",
        "what": "n/a",
        "when": "?",
        "who": "n/a",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "call_center_summarization_real_call_transcripts",
      "display_name": "Summarization (Real)",
      "description": "Summarization with real call transcripts",
      "metric_groups": [
        "summarization_metrics",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_split": "test"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "summarization",
        "what": "n/a",
        "when": "?",
        "who": "n/a",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "call_center_summarization_pairwise_comparison",
      "display_name": "Summarization (Pairwise)",
      "description": "Summarization of call center transcripts, evaluated by pairwise comparison against a baseline summary.",
      "metric_groups": [
        "pairwise_comparison_metrics",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_split": "test"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "summarization",
        "what": "n/a",
        "when": "?",
        "who": "n/a",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "call_center_summarization_key_points_recall",
      "display_name": "Summarization (Key Points Recall)",
      "description": "Summarization of call center transcripts, evaluated by recall of key points.",
      "metric_groups": [
        "key_points_recall_metrics",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_split": "test"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "summarization",
        "what": "n/a",
        "when": "?",
        "who": "n/a",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    }
  ],
  "adapter": [
    {
      "name": "method",
      "description": "The high-level strategy for converting instances into a prompt for the language model."
    },
    {
      "name": "global_prefix",
      "description": "The string that is prepended to the entire prompt."
    },
    {
      "name": "global_suffix",
      "description": "The string that is appended to the entire prompt."
    },
    {
      "name": "instructions",
      "description": "The description of the task that is included at the very beginning of the prompt."
    },
    {
      "name": "input_prefix",
      "description": "The string that is included before each input (e.g., 'Question:')."
    },
    {
      "name": "input_suffix",
      "description": "The string that is included after each input (e.g., '\\n')."
    },
    {
      "name": "reference_prefix",
      "description": "The string that is included before each reference (for multiple-choice questions)."
    },
    {
      "name": "reference_suffix",
      "description": "The string that is included after each reference (for multiple-choice questions)."
    },
    {
      "name": "output_prefix",
      "description": "The string that is included before the correct answer/predicted output (e.g., 'Answer:')."
    },
    {
      "name": "output_suffix",
      "description": "The string that is included after the correct answer/predicted output (e.g., '\\n')."
    },
    {
      "name": "instance_prefix",
      "description": "The string that is included before each instance (e.g., '\\n\\n')."
    },
    {
      "name": "substitutions",
      "description": "A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n') to perform at the very end on the prompt."
    },
    {
      "name": "max_train_instances",
      "description": "Maximum number of training instances to include in the prompt (currently by randomly sampling)."
    },
    {
      "name": "max_eval_instances",
      "description": "Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."
    },
    {
      "name": "num_outputs",
      "description": "Maximum number of possible outputs to generate by sampling multiple outputs."
    },
    {
      "name": "num_train_trials",
      "description": "Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance."
    },
    {
      "name": "num_trials",
      "description": "Number of trials, where we query the model with the same requests, but different random seeds."
    },
    {
      "name": "sample_train",
      "description": "If true, randomly sample N training examples; if false, select N consecutive training examples"
    },
    {
      "name": "model_deployment",
      "description": "Name of the language model deployment (<host_organization>/<model name>) to send requests to."
    },
    {
      "name": "model",
      "description": "Name of the language model (<creator_organization>/<model name>) to send requests to."
    },
    {
      "name": "temperature",
      "description": "Temperature parameter used in generation."
    },
    {
      "name": "max_tokens",
      "description": "Maximum number of tokens to generate."
    },
    {
      "name": "stop_sequences",
      "description": "List of stop sequences. Output generation will be stopped if any stop sequence is encountered."
    },
    {
      "name": "random",
      "description": "Random seed (string), which guarantees reproducibility."
    },
    {
      "name": "multi_label",
      "description": "If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references."
    },
    {
      "name": "image_generation_parameters",
      "description": "Parameters for image generation."
    },
    {
      "name": "eval_splits",
      "description": "The splits from which evaluation instances will be drawn."
    }
  ],
  "models": [
    {
      "name": "anthropic/claude-3-5-sonnet-20240620",
      "display_name": "Claude 3.5 Sonnet (20240620)",
      "short_display_name": "Claude 3.5 Sonnet (20240620)",
      "description": "Claude 3.5 Sonnet is a Claude 3 family model which outperforms Claude 3 Opus while operating faster and at a lower cost. ([blog](https://www.anthropic.com/news/claude-3-5-sonnet))",
      "creator_organization": "Anthropic",
      "access": "limited",
      "todo": false,
      "release_date": "2024-06-20"
    },
    {
      "name": "meta/llama-3-8b-chat",
      "display_name": "Llama 3 Instruct (8B)",
      "short_display_name": "Llama 3 Instruct (8B)",
      "description": "Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-04-18",
      "num_parameters": 8000000000
    },
    {
      "name": "meta/llama-3-70b-chat",
      "display_name": "Llama 3 Instruct (70B)",
      "short_display_name": "Llama 3 Instruct (70B)",
      "description": "Llama 3 is a family of language models that have been trained on more than 15 trillion tokens, and use Grouped-Query Attention (GQA) for improved inference scalability. It used SFT, rejection sampling, PPO and DPO for post-training. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-04-18",
      "num_parameters": 70000000000
    },
    {
      "name": "openai/gpt-4o-2024-05-13",
      "display_name": "GPT-4o (2024-05-13)",
      "short_display_name": "GPT-4o (2024-05-13)",
      "description": "GPT-4o (2024-05-13) is a large multimodal model that accepts as input any combination of text, audio, and image and generates any combination of text, audio, and image outputs. ([blog](https://openai.com/index/hello-gpt-4o/))",
      "creator_organization": "OpenAI",
      "access": "limited",
      "todo": false,
      "release_date": "2024-05-13"
    },
    {
      "name": "openai/gpt-4o-mini-2024-07-18",
      "display_name": "GPT-4o mini (2024-07-18)",
      "short_display_name": "GPT-4o mini (2024-07-18)",
      "description": "GPT-4o mini (2024-07-18) is a multimodal model with a context window of 128K tokens and improved handling of non-English text. ([blog](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/))",
      "creator_organization": "OpenAI",
      "access": "limited",
      "todo": false,
      "release_date": "2024-07-18"
    }
  ]
}