{
  "metrics": [
    {
      "name": "num_perplexity_tokens",
      "display_name": "# tokens",
      "description": "Average number of tokens in the predicted output (for language modeling, the input too)."
    },
    {
      "name": "num_bytes",
      "display_name": "# bytes",
      "description": "Average number of bytes in the predicted output (for language modeling, the input too)."
    },
    {
      "name": "num_references",
      "display_name": "# ref",
      "description": "Number of references."
    },
    {
      "name": "num_train_trials",
      "display_name": "# trials",
      "description": "Number of trials, where in each trial we choose an independent, random set of training instances."
    },
    {
      "name": "estimated_num_tokens_cost",
      "display_name": "cost",
      "description": "An estimate of the number of tokens (including prompt and output completions) needed to perform the request."
    },
    {
      "name": "num_prompt_tokens",
      "display_name": "# prompt tokens",
      "description": "Number of tokens in the prompt."
    },
    {
      "name": "num_prompt_characters",
      "display_name": "# prompt chars",
      "description": "Number of characters in the prompt."
    },
    {
      "name": "num_completion_tokens",
      "display_name": "# completion tokens",
      "description": "Actual number of completion tokens (over all completions)."
    },
    {
      "name": "num_output_tokens",
      "display_name": "# output tokens",
      "description": "Actual number of output tokens."
    },
    {
      "name": "max_num_output_tokens",
      "display_name": "Max output tokens",
      "description": "Maximum number of output tokens (overestimate since we might stop earlier due to stop sequences)."
    },
    {
      "name": "num_requests",
      "display_name": "# requests",
      "description": "Number of distinct API requests."
    },
    {
      "name": "num_instances",
      "display_name": "# eval",
      "description": "Number of evaluation instances."
    },
    {
      "name": "num_train_instances",
      "display_name": "# train",
      "description": "Number of training instances (e.g., in-context examples)."
    },
    {
      "name": "prompt_truncated",
      "display_name": "truncated",
      "description": "Fraction of instances where the prompt itself was truncated (implies that there were no in-context examples)."
    },
    {
      "name": "finish_reason_length",
      "display_name": "finish b/c length",
      "description": "Fraction of instances where the output was terminated because of the max tokens limit."
    },
    {
      "name": "finish_reason_stop",
      "display_name": "finish b/c stop",
      "description": "Fraction of instances where the output was terminated because of the stop sequences."
    },
    {
      "name": "finish_reason_endoftext",
      "display_name": "finish b/c endoftext",
      "description": "Fraction of instances where the output was terminated because the end of text token was generated."
    },
    {
      "name": "finish_reason_unknown",
      "display_name": "finish b/c unknown",
      "description": "Fraction of instances where the output was terminated for unknown reasons."
    },
    {
      "name": "num_completions",
      "display_name": "# completions",
      "description": "Number of completions."
    },
    {
      "name": "predicted_index",
      "display_name": "Predicted index",
      "description": "Integer index of the reference (0, 1, ...) that was predicted by the model (for multiple-choice)."
    },
    {
      "name": "program_accuracy",
      "display_name": "Program Accuracy",
      "description": "Accuracy of the generated programs",
      "lower_is_better": false
    },
    {
      "name": "execution_accuracy",
      "display_name": "Execution Accuracy",
      "description": "Accuracy of the final result of the generated program",
      "lower_is_better": false
    },
    {
      "name": "annotation_financebench_label_correct_answer",
      "display_name": "Correct Answer",
      "description": "Whether the final result was correct, as judged by GPT-4o.",
      "lower_is_better": false
    },
    {
      "name": "quasi_exact_match",
      "display_name": "Quasi-exact match",
      "short_display_name": "EM",
      "description": "Fraction of instances that the predicted output matches a correct reference up to light processing.",
      "lower_is_better": false
    },
    {
      "name": "error_rate",
      "display_name": "SQL Error Rate",
      "short_display_name": "SQL Error Rate",
      "description": "Fraction of generated queries that result in a SQL execution error",
      "lower_is_better": true
    }
  ],
  "perturbations": [],
  "metric_groups": [
    {
      "name": "accuracy",
      "display_name": "Accuracy",
      "metrics": [
        {
          "name": "${main_name}",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    },
    {
      "name": "efficiency",
      "display_name": "Efficiency",
      "metrics": [
        {
          "name": "inference_runtime",
          "split": "${main_split}"
        }
      ]
    },
    {
      "name": "general_information",
      "display_name": "General information",
      "metrics": [
        {
          "name": "num_instances",
          "split": "${main_split}"
        },
        {
          "name": "num_train_instances",
          "split": "${main_split}"
        },
        {
          "name": "prompt_truncated",
          "split": "${main_split}"
        },
        {
          "name": "num_prompt_tokens",
          "split": "${main_split}"
        },
        {
          "name": "num_output_tokens",
          "split": "${main_split}"
        }
      ],
      "hide_win_rates": true
    }
  ],
  "run_groups": [
    {
      "name": "text_to_sql_scenarios",
      "display_name": "Text-to-SQL Scenarios",
      "description": "Text-to-SQL Scenarios",
      "metric_groups": [],
      "subgroups": [
        "spider",
        "bird_sql"
      ],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {},
      "category": "All scenarios",
      "visibility": "all_groups",
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "spider",
      "display_name": "Spider 1.0 (Test)",
      "description": "Spider 1.0 (Test)",
      "metric_groups": [
        "accuracy",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_name": "execution_accuracy",
        "main_split": "valid"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "text-to-SQL",
        "what": "databases from various domains",
        "when": "?",
        "who": "expert data scientists",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    },
    {
      "name": "bird_sql",
      "display_name": "BIRD-SQL (Dev)",
      "description": "BIRD-SQL (Dev)",
      "metric_groups": [
        "accuracy",
        "efficiency",
        "general_information"
      ],
      "subgroups": [],
      "subgroup_display_mode": "by_metric",
      "subgroup_metric_groups_hidden": [],
      "environment": {
        "main_name": "execution_accuracy",
        "main_split": "valid"
      },
      "category": "Scenarios",
      "visibility": "all_groups",
      "taxonomy": {
        "task": "text-to-SQL",
        "what": "databases from various domains",
        "when": "?",
        "who": "expert data scientists",
        "language": "English"
      },
      "todo": false,
      "adapter_keys_shown": [
        "model_deployment",
        "model"
      ]
    }
  ],
  "adapter": [
    {
      "name": "method",
      "description": "The high-level strategy for converting instances into a prompt for the language model."
    },
    {
      "name": "global_prefix",
      "description": "The string that is prepended to the entire prompt."
    },
    {
      "name": "global_suffix",
      "description": "The string that is appended to the entire prompt."
    },
    {
      "name": "instructions",
      "description": "The description of the task that is included at the very beginning of the prompt."
    },
    {
      "name": "input_prefix",
      "description": "The string that is included before each input (e.g., 'Question:')."
    },
    {
      "name": "input_suffix",
      "description": "The string that is included after each input (e.g., '\\n')."
    },
    {
      "name": "reference_prefix",
      "description": "The string that is included before each reference (for multiple-choice questions)."
    },
    {
      "name": "reference_suffix",
      "description": "The string that is included after each reference (for multiple-choice questions)."
    },
    {
      "name": "chain_of_thought_prefix",
      "description": "The string that is included before each chain of thought. (e.g., 'Let's think step by step')"
    },
    {
      "name": "chain_of_thought_suffix",
      "description": "The string that is included after each chain of thought. (e.g., 'The correct answer is')"
    },
    {
      "name": "output_prefix",
      "description": "The string that is included before the correct answer/predicted output (e.g., 'Answer:')."
    },
    {
      "name": "output_suffix",
      "description": "The string that is included after the correct answer/predicted output (e.g., '\\n')."
    },
    {
      "name": "instance_prefix",
      "description": "The string that is included before each instance (e.g., '\\n\\n')."
    },
    {
      "name": "substitutions",
      "description": "A list of regular expression substitutions (e.g., replacing '\\n' with ';\\n') to perform at the very end on the prompt."
    },
    {
      "name": "max_train_instances",
      "description": "Maximum number of training instances to include in the prompt (currently by randomly sampling)."
    },
    {
      "name": "max_eval_instances",
      "description": "Maximum number of instances to evaluate on (over all splits - test, valid, etc.)."
    },
    {
      "name": "num_outputs",
      "description": "Maximum number of possible outputs to generate by sampling multiple outputs."
    },
    {
      "name": "num_train_trials",
      "description": "Number of trials, where in each trial we choose an independent, random set of training instances. Used to compute variance."
    },
    {
      "name": "num_trials",
      "description": "Number of trials, where we query the model with the same requests, but different random seeds."
    },
    {
      "name": "sample_train",
      "description": "If true, randomly sample N training examples; if false, select N consecutive training examples"
    },
    {
      "name": "model_deployment",
      "description": "Name of the language model deployment (<host_organization>/<model name>) to send requests to."
    },
    {
      "name": "model",
      "description": "Name of the language model (<creator_organization>/<model name>) to send requests to."
    },
    {
      "name": "temperature",
      "description": "Temperature parameter used in generation."
    },
    {
      "name": "max_tokens",
      "description": "Maximum number of tokens to generate."
    },
    {
      "name": "stop_sequences",
      "description": "List of stop sequences. Output generation will be stopped if any stop sequence is encountered."
    },
    {
      "name": "random",
      "description": "Random seed (string), which guarantees reproducibility."
    },
    {
      "name": "multi_label",
      "description": "If true, for instances with multiple correct reference, the gold answer should be considered to be all of the correct references rather than any of the correct references."
    },
    {
      "name": "image_generation_parameters",
      "description": "Parameters for image generation."
    },
    {
      "name": "eval_splits",
      "description": "The splits from which evaluation instances will be drawn."
    }
  ],
  "models": [
    {
      "name": "anthropic/claude-3-5-haiku-20241022",
      "display_name": "Claude 3.5 Haiku (20241022)",
      "short_display_name": "Claude 3.5 Haiku (20241022)",
      "description": "Claude 3.5 Haiku is a Claude 3 family model which matches the performance of Claude 3 Opus at a similar speed to the previous generation of Haiku ([blog](https://www.anthropic.com/news/3-5-models-and-computer-use)).",
      "creator_organization": "Anthropic",
      "access": "limited",
      "todo": false,
      "release_date": "2024-11-04"
    },
    {
      "name": "anthropic/claude-3-5-sonnet-20240620",
      "display_name": "Claude 3.5 Sonnet (20240620)",
      "short_display_name": "Claude 3.5 Sonnet (20240620)",
      "description": "Claude 3.5 Sonnet is a Claude 3 family model which outperforms Claude 3 Opus while operating faster and at a lower cost. ([blog](https://www.anthropic.com/news/claude-3-5-sonnet))",
      "creator_organization": "Anthropic",
      "access": "limited",
      "todo": false,
      "release_date": "2024-06-20"
    },
    {
      "name": "google/gemini-1.5-pro-002",
      "display_name": "Gemini 1.5 Pro (002)",
      "short_display_name": "Gemini 1.5 Pro (002)",
      "description": "Gemini 1.5 Pro is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))",
      "creator_organization": "Google",
      "access": "limited",
      "todo": false,
      "release_date": "2024-09-24"
    },
    {
      "name": "google/gemini-1.5-flash-002",
      "display_name": "Gemini 1.5 Flash (002)",
      "short_display_name": "Gemini 1.5 Flash (002)",
      "description": "Gemini 1.5 Flash is a multimodal mixture-of-experts model capable of recalling and reasoning over fine-grained information from long contexts. This model is accessed through Vertex AI and has all safety thresholds set to `BLOCK_NONE`. ([paper](https://arxiv.org/abs/2403.05530))",
      "creator_organization": "Google",
      "access": "limited",
      "todo": false,
      "release_date": "2024-09-24"
    },
    {
      "name": "meta/llama-3.1-8b-instruct-turbo",
      "display_name": "Llama 3.1 Instruct Turbo (8B)",
      "short_display_name": "Llama 3.1 Instruct Turbo (8B)",
      "description": "Llama 3.1 (8B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), [blog](https://ai.meta.com/blog/meta-llama-3-1/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-07-23",
      "num_parameters": 8000000000
    },
    {
      "name": "meta/llama-3.1-70b-instruct-turbo",
      "display_name": "Llama 3.1 Instruct Turbo (70B)",
      "short_display_name": "Llama 3.1 Instruct Turbo (70B)",
      "description": "Llama 3.1 (70B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), [blog](https://ai.meta.com/blog/meta-llama-3-1/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-07-23",
      "num_parameters": 70000000000
    },
    {
      "name": "meta/llama-3.1-405b-instruct-turbo",
      "display_name": "Llama 3.1 Instruct Turbo (405B)",
      "short_display_name": "Llama 3.1 Instruct Turbo (405B)",
      "description": "Llama 3.1 (405B) is part of the Llama 3 family of dense Transformer models that natively support multilinguality, coding, reasoning, and tool usage. ([paper](https://ai.meta.com/research/publications/the-llama-3-herd-of-models/), [blog](https://ai.meta.com/blog/meta-llama-3-1/)) Turbo is Together's implementation, providing a near negligible difference in quality from the reference implementation with faster performance and lower cost, currently using FP8 quantization. ([blog](https://www.together.ai/blog/llama-31-quality))",
      "creator_organization": "Meta",
      "access": "open",
      "todo": false,
      "release_date": "2024-07-23",
      "num_parameters": 405000000000
    },
    {
      "name": "openai/gpt-4o-2024-08-06",
      "display_name": "GPT-4o (2024-08-06)",
      "short_display_name": "GPT-4o (2024-08-06)",
      "description": "GPT-4o (2024-08-06) is a large multimodal model that accepts as input any combination of text, audio, and image and generates any combination of text, audio, and image outputs. ([blog](https://openai.com/index/introducing-structured-outputs-in-the-api/))",
      "creator_organization": "OpenAI",
      "access": "limited",
      "todo": false,
      "release_date": "2024-08-06"
    },
    {
      "name": "openai/gpt-4o-mini-2024-07-18",
      "display_name": "GPT-4o mini (2024-07-18)",
      "short_display_name": "GPT-4o mini (2024-07-18)",
      "description": "GPT-4o mini (2024-07-18) is a multimodal model with a context window of 128K tokens and improved handling of non-English text. ([blog](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/))",
      "creator_organization": "OpenAI",
      "access": "limited",
      "todo": false,
      "release_date": "2024-07-18"
    }
  ]
}