{
  "metrics": [
    {
      "name": "num_perplexity_tokens",
      "display_name": "# tokens",
      "description": "Average number of tokens in the predicted output (for language modeling, the input too)."
    },
    {
      "name": "num_bytes",
      "display_name": "# bytes",
      "description": "Average number of bytes in the predicted output (for language modeling, the input too)."
    },
    {
      "name": "num_references",
      "display_name": "# ref",
      "description": "Number of references."
    },
    {
      "name": "num_train_trials",
      "display_name": "# trials",
      "description": "Number of trials, where in each trial we choose an independent, random set of training instances."
    },
    {
      "name": "estimated_num_tokens_cost",
      "display_name": "cost",
      "description": "An estimate of the number of tokens (including prompt and output completions) needed to perform the request."
    },
    {
      "name": "num_prompt_tokens",
      "display_name": "# prompt tokens",
      "description": "Number of tokens in the prompt."
    },
    {
      "name": "num_prompt_characters",
      "display_name": "# prompt chars",
      "description": "Number of characters in the prompt."
    },
    {
      "name": "num_completion_tokens",
      "display_name": "# completion tokens",
      "description": "Actual number of completion tokens (over all completions)."
    },
    {
      "name": "num_output_tokens",
      "display_name": "# output tokens",
      "description": "Actual number of output tokens."
    },
    {
      "name": "max_num_output_tokens",
      "display_name": "Max output tokens",
      "description": "Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences)."
    },
    {
      "name": "num_requests",
      "display_name": "# requests",
      "description": "Number of distinct API requests."
    },
    {
      "name": "num_instances",
      "display_name": "# eval",
      "description": "Number of evaluation instances."
    },
    {
      "name": "num_train_instances",
      "display_name": "# train",
      "description": "Number of training instances (e.g., in-context examples)."
    },
    {
      "name": "prompt_truncated",
      "display_name": "truncated",
      "description": "Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples)."
    },
    {
      "name": "finish_reason_length",
      "display_name": "finish b/c length",
      "description": "Fraction of instances where the output was terminated because of the max tokens limit."
    },
    {
      "name": "finish_reason_stop",
      "display_name": "finish b/c stop",
      "description": "Fraction of instances where the output was terminated because of the stop sequences."
    },
    {
      "name": "finish_reason_endoftext",
      "display_name": "finish b/c endoftext",
      "description": "Fraction of instances where the output was terminated because the end of text token was generated."
    },
    {
      "name": "finish_reason_unknown",
      "display_name": "finish b/c unknown",
      "description": "Fraction of instances where the output was terminated for unknown reasons."
    },
    {
      "name": "num_completions",
      "display_name": "# completions",
      "description": "Number of completions."
    },
    {
      "name": "predicted_index",
      "display_name": "Predicted index",
      "description": "Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice)."
    },
    {
      "name": "exact_match",
      "display_name": "Exact match",
      "short_display_name": "EM",
      "description": "Fraction of instances that the predicted output matches a correct reference exactly.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_faithfulness",
      "display_name": "Faithfulness",
      "short_display_name": "Faithfulness",
      "description": "Whether all the information expressed by the summary can be inferred from the source transcript.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_relevance",
      "display_name": "Relevance",
      "short_display_name": "Relevance",
      "description": "Whether the summary includes only important information from the source.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_coherence",
      "display_name": "Coherence",
      "short_display_name": "Coherence",
      "description": "Whether the summary organizes the relevant information into a well-structured summary.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_pairwise_comparison_score",
      "display_name": "Pairwise",
      "short_display_name": "Pairwise",
      "description": "Whether the model's summary was preferred by the evaluator model.",
      "lower_is_better": false
    },
    {
      "name": "annotation_call_center_summarization_key_points_recall_score",
      "display_name": "Recall",
      "short_display_name": "Recall",
      "description": "How many key items were recalled.",
      "lower_is_better": false
    },
    {
      "name": "annotation_helpdesk_call_center_summarization_score",
      "display_name": "Score",
      "short_display_name": "Score",
      "description": "Score",
      "lower_is_better": false
    },
    {
      "name": "call_summarization_score",
      "display_name": "Score",
      "short_display_name": "Score",
      "description": "Score",
      "lower_is_better": false
    }
  ],
  "perturbations": [],
  "metric_groups": [
    {
      "name": "summarization_metrics",
      "display_name": "Summarization",
      "metrics": [
        {
          "name": "call_summarization_score",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "pairwise_comparison_metrics",
      "display_name": "Pairwise Comparison",
      "metrics": [
        {
          "name": "annotation_call_center_summarization_pairwise_comparison_score",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "key_points_recall_metrics",
      "metrics": [
        {
          "name": "annotation_call_center_summarization_key_points_recall_score",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "efficiency",
      "display_name": "Efficiency",
      "metrics": [
        {
          "name": "inference_runtime",
          "split": "${main_split}"
        }
      ]
    },
    {
      "name": "general_information",
      "display_name": "General information",
      "metrics": [
        {
          "name": "num_instances",
          "split": "${main_split}"
        },
        {
          "name": "num_train_instances",
          "split": "${main_split}"
        },
        {
          "name": "prompt_truncated",
          "split": "${main_split}"
        },
        {
          "name": "num_prompt_tokens",
          "split": "${main_split}"
        },
        {
          "name": "num_output_tokens",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    }
  ],
  "run_groups": [
    {
      "name": "call_center_scenarios",
      "display_name": "Call Center Scenarios",
      "description": "Scenarios representing realistic tasks from the call center.",
      "metric_groups": [],
      "subgroups": [
        "helpdesk_call_summarization"
      ],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {},
      "category": "All scenarios",
      "visibility": "all_groups",
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "helpdesk_call_summarization",
      "display_name": "Helpdesk Call summarization",
      "description": "Helpdesk Call summarization",
      "metric_groups": [
        "summarization_metrics",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_split": "test"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "summarization",
        "what": "n/a",
        "when": "?",
        "who": "n/a",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    }
  ],
  "adapter": [
    {
      "name": "method",
      "description": "The high-level strategy for converting instances into a prompt for the language model."
    },
    {
      "name": "global_prefix",
      "description": "The string that is prepended to the entire prompt."
    },
    {
      "name": "global_suffix",
      "description": "The string that is appended to the entire prompt."
    },
    {
      "name": "instructions",
      "description": "The description of the task that is included at the very beginning of the prompt."
    },
    {
      "name": "input_prefix",
      "description": "The string that is included before each input (e.g., 'Question:')."
    },
    {
      "name": "input_suffix",
      "description": "The string that is included after each input (e.g., '\\n')."
    },
    {
      "name": "reference_prefix",
      "description": "The string that is included before each reference (for multiple-choice questions)."
    },
    {
      "name": "reference_suffix",
      "description": "The string that is included after each reference (for multiple-choice questions)."
    },
    {
      "name": "chain_of_thought_prefix",
      "description": "The string that is included before each chain of thought. (e.g., 'Let's think step by step')"
    },
    {
      "name": "chain_of_thought_suffix",
      "description": "The string that is included after each chain of thought. (e.g., 'The correct answer is')"
    },
    {
      "name": "output_prefix",
      "description": "The string that is included before the correct answer/predicted output (e.g., 'Answer:')."
    },
    {
      "name": "output_suffix",
      "description": "The string that is included after the correct answer/predicted output (e.g., '\\n')."
    },
    {
      "name": "instance_prefix",
      "description": "The string that is included before each instance (e.g., '\\n\\n')."
    },
    {
      "name": "substitutions",
      "description": "A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n') to perform at the very end on the prompt."
    },
    {
      "name": "max_train_instances",
      "description": "Maximum number of training instances to include in the prompt (currently by randomly sampling)."
    },
    {
      "name": "max_eval_instances",
      "description": "Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."
    },
    {
      "name": "num_outputs",
      "description": "Maximum number of possible outputs to generate by sampling multiple outputs."
    },
    {
      "name": "num_train_trials",
      "description": "Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance."
    },
    {
      "name": "num_trials",
      "description": "Number of trials, where we query the model with the same requests, but different random seeds."
    },
    {
      "name": "sample_train",
      "description": "If true, randomly sample N training examples; if false, select N consecutive training examples"
    },
    {
      "name": "model_deployment",
      "description": "Name of the language model deployment (<host_organization>/<model name>) to send requests to."
    },
    {
      "name": "model",
      "description": "Name of the language model (<creator_organization>/<model name>) to send requests to."
    },
    {
      "name": "temperature",
      "description": "Temperature parameter used in generation."
    },
    {
      "name": "max_tokens",
      "description": "Maximum number of tokens to generate."
    },
    {
      "name": "stop_sequences",
      "description": "List of stop sequences. Output generation will be stopped if any stop sequence is encountered."
    },
    {
      "name": "random",
      "description": "Random seed (string), which guarantees reproducibility."
    },
    {
      "name": "multi_label",
      "description": "If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references."
    },
    {
      "name": "image_generation_parameters",
      "description": "Parameters for image generation."
    },
    {
      "name": "eval_splits",
      "description": "The splits from which evaluation instances will be drawn."
    }
  ],
  "models": [
    {
      "name": "anthropic/claude-3-5-haiku-20241022",
      "display_name": "Claude 3.5 Haiku (20241022)",
      "short_display_name": "Claude 3.5 Haiku (20241022)",
      "description": "Claude 3.5 Haiku is a Claude 3 family model which matches the performance of Claude 3 Opus at a similar speed to the previous generation of Haiku ([blog](https://www.anthropic.com/news/3-5-models-and-computer-use)).",
      "creator_organization": "Anthropic",
      "access": "limited",
      "todo": false,
      "release_date": "2024-11-04"
    },
    {
      "name": "anthropic/claude-3-5-sonnet-20240620",
      "display_name": "Claude 3.5 Sonnet (20240620)",
      "short_display_name": "Claude 3.5 Sonnet (20240620)",
      "description": "Claude 3.5 Sonnet is a Claude 3 family model which outperforms Claude 3 Opus while operating faster and at a lower cost. ([blog](https://www.anthropic.com/news/claude-3-5-sonnet))",
      "creator_organization": "Anthropic",
      "access": "limited",
      "todo": false,
      "release_date": "2024-06-20"
    },
    {
      "name": "anthropic/claude-3-7-sonnet-20250219",
      "display_name": "Claude 3.7 Sonnet (20250219)",
      "short_display_name": "Claude 3.7 Sonnet (20250219)",
      "description": "Claude 3.7 Sonnet is a Claude 3 family hybrid reasoning model that can produce near-instant responses or extended, step-by-step thinking that is made visible to the user ([blog](https://www.anthropic.com/news/claude-3-7-sonnet)).",
      "creator_organization": "Anthropic",
      "access": "limited",
      "todo": false,
      "release_date": "2025-02-24"
    },
    {
      "name": "deepseek-ai/deepseek-v3",
      "display_name": "DeepSeek v3",
      "short_display_name": "DeepSeek v3",
      "description": "DeepSeek v3 a Mixture-of-Experts (MoE) language model with 671B total parameters with 37B activated for each token. It adopts Multi-head Latent Attention (MLA) and DeepSeekMoE architectures. ([paper](https://github.com/deepseek-ai/DeepSeek-V3/blob/main/DeepSeek_V3.pdf))",
      "creator_organization": "DeepSeek",
      "access": "open",
      "todo": false,
      "release_date": "2024-12-24",
      "num_parameters": 685000000000
    },
    {
      "name": "google/gemini-1.5-pro-002",
      "display_name": "Gemini 1.5 Pro (002)",
      "short_display_name": "Gemini 1.5 Pro (002)",
      "description": "Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))",
      "creator_organization": "Google",
      "access": "limited",
      "todo": false,
      "release_date": "2024-09-24"
    },
    {
      "name": "google/gemini-1.5-flash-002",
      "display_name": "Gemini 1.5 Flash (002)",
      "short_display_name": "Gemini 1.5 Flash (002)",
      "description": "Gemini 1.5 Flash is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))",
      "creator_organization": "Google",
      "access": "limited",
      "todo": false,
      "release_date": "2024-09-24"
    },
    {
      "name": "google/gemini-2.0-flash-001",
      "display_name": "Gemini 2.0 Flash",
      "short_display_name": "Gemini 2.0 Flash",
      "description": "Gemini 2.0 Flash ([documentation](https://ai.google.dev/gemini-api/docs/models/gemini))",
      "creator_organization": "Google",
      "access": "limited",
      "todo": false,
      "release_date": "2025-02-01"
    },
    {
      "name": "meta/llama-3.1-8b-instruct-turbo",
      "display_name": "Llama 3.1 Instruct Turbo (8B)",
      "short_display_name": "Llama 3.1 Instruct Turbo (8B)",
      "description": "Llama 3.1 (8B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), [blog](https://ai.meta.com/blog/meta-llama-3-1/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-07-23",
      "num_parameters": 8000000000
    },
    {
      "name": "meta/llama-3.1-70b-instruct-turbo",
      "display_name": "Llama 3.1 Instruct Turbo (70B)",
      "short_display_name": "Llama 3.1 Instruct Turbo (70B)",
      "description": "Llama 3.1 (70B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), [blog](https://ai.meta.com/blog/meta-llama-3-1/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-07-23",
      "num_parameters": 70000000000
    },
    {
      "name": "meta/llama-3.1-405b-instruct-turbo",
      "display_name": "Llama 3.1 Instruct Turbo (405B)",
      "short_display_name": "Llama 3.1 Instruct Turbo (405B)",
      "description": "Llama 3.1 (405B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), [blog](https://ai.meta.com/blog/meta-llama-3-1/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-07-23",
      "num_parameters": 405000000000
    },
    {
      "name": "mistralai/mistral-7b-instruct-v0.3",
      "display_name": "Mistral Instruct v0.3 (7B)",
      "short_display_name": "Mistral Instruct v0.3 (7B)",
      "description": "Mistral v0.3 Instruct 7B is a 7.3B parameter transformer model that uses Grouped-Query Attention (GQA). Compared to v0.1, v0.2 has a 32k context window and no Sliding-Window Attention (SWA). ([blog post](https://mistral.ai/news/la-plateforme/))",
      "creator_organization": "Mistral AI",
      "access": "open",
      "todo": false,
      "release_date": "2024-05-22",
      "num_parameters": 7300000000
    },
    {
      "name": "mistralai/mixtral-8x7b-instruct-v0.1",
      "display_name": "Mixtral Instruct (8x7B)",
      "short_display_name": "Mixtral Instruct (8x7B)",
      "description": "Mixtral Instruct (8x7B) is a version of Mixtral (8x7B) that was optimized through supervised fine-tuning and direct preference optimisation (DPO) for careful instruction following. ([blog post](https://mistral.ai/news/mixtral-of-experts/)).",
      "creator_organization": "Mistral AI",
      "access": "open",
      "todo": false,
      "release_date": "2023-12-11",
      "num_parameters": 46700000000
    },
    {
      "name": "mistralai/mixtral-8x22b-instruct-v0.1",
      "display_name": "Mixtral Instruct (8x22B)",
      "short_display_name": "Mixtral Instruct (8x22B)",
      "description": "Mistral AI's mixture-of-experts model that uses 39B active parameters out of 141B ([blog post](https://mistral.ai/news/mixtral-8x22b/)).",
      "creator_organization": "Mistral AI",
      "access": "open",
      "todo": false,
      "release_date": "2024-04-10",
      "num_parameters": 176000000000
    },
    {
      "name": "openai/gpt-4o-2024-11-20",
      "display_name": "GPT-4o (2024-11-20)",
      "short_display_name": "GPT-4o (2024-11-20)",
      "description": "GPT-4o (2024-11-20) is a large multimodal model that accepts as input any combination of text, audio, and image and generates any combination of text, audio, and image outputs. ([blog](https://openai.com/index/introducing-structured-outputs-in-the-api/))",
      "creator_organization": "OpenAI",
      "access": "limited",
      "todo": false,
      "release_date": "2024-11-20"
    },
    {
      "name": "openai/gpt-4o-mini-2024-07-18",
      "display_name": "GPT-4o mini (2024-07-18)",
      "short_display_name": "GPT-4o mini (2024-07-18)",
      "description": "GPT-4o mini (2024-07-18) is a multimodal model with a context window of 128K tokens and improved handling of non-English text. ([blog](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/))",
      "creator_organization": "OpenAI",
      "access": "limited",
      "todo": false,
      "release_date": "2024-07-18"
    },
    {
      "name": "qwen/qwen2.5-7b-instruct-turbo",
      "display_name": "Qwen2.5 Instruct Turbo (7B)",
      "short_display_name": "Qwen2.5 Instruct Turbo (7B)",
      "description": "Qwen2.5 Instruct Turbo (7B) was trained on 18 trillion tokens and supports 29 languages, and shows improvements over Qwen2 in knowledge, coding, mathematics, instruction following, generating long texts, and processing structure data. ([blog](https://qwenlm.github.io/blog/qwen2.5/)) Turbo is Together's cost-efficient implementation, providing fast FP8 performance while maintaining quality, closely matching FP16 reference models. ([blog](https://www.together.ai/blog/together-inference-engine-2))",
      "creator_organization": "Qwen",
      "access": "open",
      "todo": false,
      "release_date": "2024-09-19"
    },
    {
      "name": "qwen/qwen2.5-72b-instruct-turbo",
      "display_name": "Qwen2.5 Instruct Turbo (72B)",
      "short_display_name": "Qwen2.5 Instruct Turbo (72B)",
      "description": "Qwen2.5 Instruct Turbo (72B) was trained on 18 trillion tokens and supports 29 languages, and shows improvements over Qwen2 in knowledge, coding, mathematics, instruction following, generating long texts, and processing structure data. ([blog](https://qwenlm.github.io/blog/qwen2.5/)) Turbo is Together's cost-efficient implementation, providing fast FP8 performance while maintaining quality, closely matching FP16 reference models. ([blog](https://www.together.ai/blog/together-inference-engine-2))",
      "creator_organization": "Qwen",
      "access": "open",
      "todo": false,
      "release_date": "2024-09-19"
    }
  ]
}