[
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=anthropic_claude-3-5-haiku-20241022",
    "run_spec": {
      "name": "czech_bank_qa:model=anthropic_claude-3-5-haiku-20241022",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-haiku-20241022",
        "model": "anthropic/claude-3-5-haiku-20241022",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1095.235294117647,
        "sum_squared": 1199540.349480969,
        "min": 1095.235294117647,
        "max": 1095.235294117647,
        "mean": 1095.235294117647,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.128670982286042,
        "sum_squared": 4.531240150826623,
        "min": 2.128670982286042,
        "max": 2.128670982286042,
        "mean": 2.128670982286042,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 220.64705882352942,
        "sum_squared": 48685.12456747405,
        "min": 220.64705882352942,
        "max": 220.64705882352942,
        "mean": 220.64705882352942,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1095.235294117647,
        "sum_squared": 1199540.349480969,
        "min": 1095.235294117647,
        "max": 1095.235294117647,
        "mean": 1095.235294117647,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1095.235294117647,
        "sum_squared": 1199540.349480969,
        "min": 1095.235294117647,
        "max": 1095.235294117647,
        "mean": 1095.235294117647,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.128670982286042,
        "sum_squared": 4.531240150826623,
        "min": 2.128670982286042,
        "max": 2.128670982286042,
        "mean": 2.128670982286042,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.128670982286042,
        "sum_squared": 4.531240150826623,
        "min": 2.128670982286042,
        "max": 2.128670982286042,
        "mean": 2.128670982286042,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 74.49019607843137,
        "sum_squared": 5548.789311803152,
        "min": 74.49019607843137,
        "max": 74.49019607843137,
        "mean": 74.49019607843137,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 220.64705882352942,
        "sum_squared": 48685.12456747405,
        "min": 220.64705882352942,
        "max": 220.64705882352942,
        "mean": 220.64705882352942,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 220.64705882352942,
        "sum_squared": 48685.12456747405,
        "min": 220.64705882352942,
        "max": 220.64705882352942,
        "mean": 220.64705882352942,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.029411764705882353,
        "sum_squared": 0.0008650519031141869,
        "min": 0.029411764705882353,
        "max": 0.029411764705882353,
        "mean": 0.029411764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.029411764705882353,
        "sum_squared": 0.0008650519031141869,
        "min": 0.029411764705882353,
        "max": 0.029411764705882353,
        "mean": 0.029411764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.029411764705882353,
        "sum_squared": 0.0008650519031141869,
        "min": 0.029411764705882353,
        "max": 0.029411764705882353,
        "mean": 0.029411764705882353,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=anthropic_claude-3-5-sonnet-20240620",
    "run_spec": {
      "name": "czech_bank_qa:model=anthropic_claude-3-5-sonnet-20240620",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "anthropic/claude-3-5-sonnet-20240620",
        "model": "anthropic/claude-3-5-sonnet-20240620",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1095.235294117647,
        "sum_squared": 1199540.349480969,
        "min": 1095.235294117647,
        "max": 1095.235294117647,
        "mean": 1095.235294117647,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.9445582441255158,
        "sum_squared": 3.781306764796509,
        "min": 1.9445582441255158,
        "max": 1.9445582441255158,
        "mean": 1.9445582441255158,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 207.68627450980392,
        "sum_squared": 43133.58861976163,
        "min": 207.68627450980392,
        "max": 207.68627450980392,
        "mean": 207.68627450980392,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1095.235294117647,
        "sum_squared": 1199540.349480969,
        "min": 1095.235294117647,
        "max": 1095.235294117647,
        "mean": 1095.235294117647,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1095.235294117647,
        "sum_squared": 1199540.349480969,
        "min": 1095.235294117647,
        "max": 1095.235294117647,
        "mean": 1095.235294117647,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9445582441255158,
        "sum_squared": 3.781306764796509,
        "min": 1.9445582441255158,
        "max": 1.9445582441255158,
        "mean": 1.9445582441255158,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.9445582441255158,
        "sum_squared": 3.781306764796509,
        "min": 1.9445582441255158,
        "max": 1.9445582441255158,
        "mean": 1.9445582441255158,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.90196078431373,
        "sum_squared": 4747.480199923107,
        "min": 68.90196078431373,
        "max": 68.90196078431373,
        "mean": 68.90196078431373,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 207.68627450980392,
        "sum_squared": 43133.58861976163,
        "min": 207.68627450980392,
        "max": 207.68627450980392,
        "mean": 207.68627450980392,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 207.68627450980392,
        "sum_squared": 43133.58861976163,
        "min": 207.68627450980392,
        "max": 207.68627450980392,
        "mean": 207.68627450980392,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.00980392156862745,
        "sum_squared": 9.611687812379854e-05,
        "min": 0.00980392156862745,
        "max": 0.00980392156862745,
        "mean": 0.00980392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.00980392156862745,
        "sum_squared": 9.611687812379854e-05,
        "min": 0.00980392156862745,
        "max": 0.00980392156862745,
        "mean": 0.00980392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.00980392156862745,
        "sum_squared": 9.611687812379854e-05,
        "min": 0.00980392156862745,
        "max": 0.00980392156862745,
        "mean": 0.00980392156862745,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=google_gemini-1.5-flash-002",
    "run_spec": {
      "name": "czech_bank_qa:model=google_gemini-1.5-flash-002",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-1.5-flash-002",
        "model": "google/gemini-1.5-flash-002",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1090.2941176470588,
        "sum_squared": 1188741.2629757784,
        "min": 1090.2941176470588,
        "max": 1090.2941176470588,
        "mean": 1090.2941176470588,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8210222697725483,
        "sum_squared": 0.6740775674624672,
        "min": 0.8210222697725483,
        "max": 0.8210222697725483,
        "mean": 0.8210222697725483,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1090.2941176470588,
        "sum_squared": 1188741.2629757784,
        "min": 1090.2941176470588,
        "max": 1090.2941176470588,
        "mean": 1090.2941176470588,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1090.2941176470588,
        "sum_squared": 1188741.2629757784,
        "min": 1090.2941176470588,
        "max": 1090.2941176470588,
        "mean": 1090.2941176470588,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8210222697725483,
        "sum_squared": 0.6740775674624672,
        "min": 0.8210222697725483,
        "max": 0.8210222697725483,
        "mean": 0.8210222697725483,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8210222697725483,
        "sum_squared": 0.6740775674624672,
        "min": 0.8210222697725483,
        "max": 0.8210222697725483,
        "mean": 0.8210222697725483,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.16666666666666666,
        "sum_squared": 0.027777777777777776,
        "min": 0.16666666666666666,
        "max": 0.16666666666666666,
        "mean": 0.16666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16666666666666666,
        "sum_squared": 0.027777777777777776,
        "min": 0.16666666666666666,
        "max": 0.16666666666666666,
        "mean": 0.16666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16666666666666666,
        "sum_squared": 0.027777777777777776,
        "min": 0.16666666666666666,
        "max": 0.16666666666666666,
        "mean": 0.16666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=google_gemini-1.5-pro-002",
    "run_spec": {
      "name": "czech_bank_qa:model=google_gemini-1.5-pro-002",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "google/gemini-1.5-pro-002",
        "model": "google/gemini-1.5-pro-002",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1090.2941176470588,
        "sum_squared": 1188741.2629757784,
        "min": 1090.2941176470588,
        "max": 1090.2941176470588,
        "mean": 1090.2941176470588,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 3.049022071501788,
        "sum_squared": 9.296535592505053,
        "min": 3.049022071501788,
        "max": 3.049022071501788,
        "mean": 3.049022071501788,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1090.2941176470588,
        "sum_squared": 1188741.2629757784,
        "min": 1090.2941176470588,
        "max": 1090.2941176470588,
        "mean": 1090.2941176470588,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1090.2941176470588,
        "sum_squared": 1188741.2629757784,
        "min": 1090.2941176470588,
        "max": 1090.2941176470588,
        "mean": 1090.2941176470588,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.049022071501788,
        "sum_squared": 9.296535592505053,
        "min": 3.049022071501788,
        "max": 3.049022071501788,
        "mean": 3.049022071501788,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 3.049022071501788,
        "sum_squared": 9.296535592505053,
        "min": 3.049022071501788,
        "max": 3.049022071501788,
        "mean": 3.049022071501788,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.029411764705882353,
        "sum_squared": 0.0008650519031141869,
        "min": 0.029411764705882353,
        "max": 0.029411764705882353,
        "mean": 0.029411764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.029411764705882353,
        "sum_squared": 0.0008650519031141869,
        "min": 0.029411764705882353,
        "max": 0.029411764705882353,
        "mean": 0.029411764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.029411764705882353,
        "sum_squared": 0.0008650519031141869,
        "min": 0.029411764705882353,
        "max": 0.029411764705882353,
        "mean": 0.029411764705882353,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=meta_llama-3.1-405b-instruct-turbo",
    "run_spec": {
      "name": "czech_bank_qa:model=meta_llama-3.1-405b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-405b-instruct-turbo",
        "model": "meta/llama-3.1-405b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 2.9554142718221628,
        "sum_squared": 8.734473518090125,
        "min": 2.9554142718221628,
        "max": 2.9554142718221628,
        "mean": 2.9554142718221628,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -3.4136547879293673,
        "sum_squared": 11.653039011153094,
        "min": -3.4136547879293673,
        "max": -3.4136547879293673,
        "mean": -3.4136547879293673,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 210.2843137254902,
        "sum_squared": 44219.49259900038,
        "min": 210.2843137254902,
        "max": 210.2843137254902,
        "mean": 210.2843137254902,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.058731431829036,
        "sum_squared": 1.1209122447427606,
        "min": 1.058731431829036,
        "max": 1.058731431829036,
        "mean": 1.058731431829036,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.023420020003401626,
        "sum_squared": 0.0005484973369597323,
        "min": 0.023420020003401626,
        "max": 0.023420020003401626,
        "mean": 0.023420020003401626,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.016233520834015358,
        "sum_squared": 0.00026352719866841066,
        "min": -0.016233520834015358,
        "max": -0.016233520834015358,
        "mean": -0.016233520834015358,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.9554142718221628,
        "sum_squared": 8.734473518090125,
        "min": 2.9554142718221628,
        "max": 2.9554142718221628,
        "mean": 2.9554142718221628,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 2.9554142718221628,
        "sum_squared": 8.734473518090125,
        "min": 2.9554142718221628,
        "max": 2.9554142718221628,
        "mean": 2.9554142718221628,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -3.4136547879293673,
        "sum_squared": 11.653039011153094,
        "min": -3.4136547879293673,
        "max": -3.4136547879293673,
        "mean": -3.4136547879293673,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -3.4136547879293673,
        "sum_squared": 11.653039011153094,
        "min": -3.4136547879293673,
        "max": -3.4136547879293673,
        "mean": -3.4136547879293673,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 59.81372549019608,
        "sum_squared": 3577.681757016532,
        "min": 59.81372549019608,
        "max": 59.81372549019608,
        "mean": 59.81372549019608,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 210.2843137254902,
        "sum_squared": 44219.49259900038,
        "min": 210.2843137254902,
        "max": 210.2843137254902,
        "mean": 210.2843137254902,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 210.2843137254902,
        "sum_squared": 44219.49259900038,
        "min": 210.2843137254902,
        "max": 210.2843137254902,
        "mean": 210.2843137254902,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0196078431372549,
        "sum_squared": 0.00038446751249519417,
        "min": 0.0196078431372549,
        "max": 0.0196078431372549,
        "mean": 0.0196078431372549,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0196078431372549,
        "sum_squared": 0.00038446751249519417,
        "min": 0.0196078431372549,
        "max": 0.0196078431372549,
        "mean": 0.0196078431372549,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0196078431372549,
        "sum_squared": 0.00038446751249519417,
        "min": 0.0196078431372549,
        "max": 0.0196078431372549,
        "mean": 0.0196078431372549,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=meta_llama-3.1-70b-instruct-turbo",
    "run_spec": {
      "name": "czech_bank_qa:model=meta_llama-3.1-70b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-70b-instruct-turbo",
        "model": "meta/llama-3.1-70b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.577847908524906,
        "sum_squared": 0.3339082053866081,
        "min": 0.577847908524906,
        "max": 0.577847908524906,
        "mean": 0.577847908524906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -4.094729861218501,
        "sum_squared": 16.766812636354484,
        "min": -4.094729861218501,
        "max": -4.094729861218501,
        "mean": -4.094729861218501,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 173.54901960784315,
        "sum_squared": 30119.262206843527,
        "min": 173.54901960784315,
        "max": 173.54901960784315,
        "mean": 173.54901960784315,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0819442499844374,
        "sum_squared": 1.1706033600743868,
        "min": 1.0819442499844374,
        "max": 1.0819442499844374,
        "mean": 1.0819442499844374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.03403906560756458,
        "sum_squared": 0.001158657987436086,
        "min": 0.03403906560756458,
        "max": 0.03403906560756458,
        "mean": 0.03403906560756458,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.023594082354778393,
        "sum_squared": 0.0005566807221640651,
        "min": -0.023594082354778393,
        "max": -0.023594082354778393,
        "mean": -0.023594082354778393,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.577847908524906,
        "sum_squared": 0.3339082053866081,
        "min": 0.577847908524906,
        "max": 0.577847908524906,
        "mean": 0.577847908524906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.577847908524906,
        "sum_squared": 0.3339082053866081,
        "min": 0.577847908524906,
        "max": 0.577847908524906,
        "mean": 0.577847908524906,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -4.094729861218501,
        "sum_squared": 16.766812636354484,
        "min": -4.094729861218501,
        "max": -4.094729861218501,
        "mean": -4.094729861218501,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -4.094729861218501,
        "sum_squared": 16.766812636354484,
        "min": -4.094729861218501,
        "max": -4.094729861218501,
        "mean": -4.094729861218501,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 51.990196078431374,
        "sum_squared": 2702.9804882737412,
        "min": 51.990196078431374,
        "max": 51.990196078431374,
        "mean": 51.990196078431374,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.54901960784315,
        "sum_squared": 30119.262206843527,
        "min": 173.54901960784315,
        "max": 173.54901960784315,
        "mean": 173.54901960784315,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 173.54901960784315,
        "sum_squared": 30119.262206843527,
        "min": 173.54901960784315,
        "max": 173.54901960784315,
        "mean": 173.54901960784315,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.13725490196078433,
        "sum_squared": 0.01883890811226452,
        "min": 0.13725490196078433,
        "max": 0.13725490196078433,
        "mean": 0.13725490196078433,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.13725490196078433,
        "sum_squared": 0.01883890811226452,
        "min": 0.13725490196078433,
        "max": 0.13725490196078433,
        "mean": 0.13725490196078433,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.13725490196078433,
        "sum_squared": 0.01883890811226452,
        "min": 0.13725490196078433,
        "max": 0.13725490196078433,
        "mean": 0.13725490196078433,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=meta_llama-3.1-8b-instruct-turbo",
    "run_spec": {
      "name": "czech_bank_qa:model=meta_llama-3.1-8b-instruct-turbo",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/llama-3.1-8b-instruct-turbo",
        "model": "meta/llama-3.1-8b-instruct-turbo",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.35602209614772423,
        "sum_squared": 0.1267517329454194,
        "min": 0.35602209614772423,
        "max": 0.35602209614772423,
        "mean": 0.35602209614772423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -5.411608814940701,
        "sum_squared": 29.2855099659439,
        "min": -5.411608814940701,
        "max": -5.411608814940701,
        "mean": -5.411608814940701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 179.7549019607843,
        "sum_squared": 32311.82477893118,
        "min": 179.7549019607843,
        "max": 179.7549019607843,
        "mean": 179.7549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1039409271373646,
        "sum_squared": 1.2186855706089041,
        "min": 1.1039409271373646,
        "max": 1.1039409271373646,
        "mean": 1.1039409271373646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.04343303640336443,
        "sum_squared": 0.0018864286512159801,
        "min": 0.04343303640336443,
        "max": 0.04343303640336443,
        "mean": 0.04343303640336443,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.030105486726149523,
        "sum_squared": 0.0009063403310183651,
        "min": -0.030105486726149523,
        "max": -0.030105486726149523,
        "mean": -0.030105486726149523,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 973.8137254901961,
        "sum_squared": 948313.1719530951,
        "min": 973.8137254901961,
        "max": 973.8137254901961,
        "mean": 973.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35602209614772423,
        "sum_squared": 0.1267517329454194,
        "min": 0.35602209614772423,
        "max": 0.35602209614772423,
        "mean": 0.35602209614772423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.35602209614772423,
        "sum_squared": 0.1267517329454194,
        "min": 0.35602209614772423,
        "max": 0.35602209614772423,
        "mean": 0.35602209614772423,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -5.411608814940701,
        "sum_squared": 29.2855099659439,
        "min": -5.411608814940701,
        "max": -5.411608814940701,
        "mean": -5.411608814940701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -5.411608814940701,
        "sum_squared": 29.2855099659439,
        "min": -5.411608814940701,
        "max": -5.411608814940701,
        "mean": -5.411608814940701,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.72549019607843,
        "sum_squared": 2994.8792772010765,
        "min": 54.72549019607843,
        "max": 54.72549019607843,
        "mean": 54.72549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.7549019607843,
        "sum_squared": 32311.82477893118,
        "min": 179.7549019607843,
        "max": 179.7549019607843,
        "mean": 179.7549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 179.7549019607843,
        "sum_squared": 32311.82477893118,
        "min": 179.7549019607843,
        "max": 179.7549019607843,
        "mean": 179.7549019607843,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.09803921568627451,
        "sum_squared": 0.009611687812379853,
        "min": 0.09803921568627451,
        "max": 0.09803921568627451,
        "mean": 0.09803921568627451,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09803921568627451,
        "sum_squared": 0.009611687812379853,
        "min": 0.09803921568627451,
        "max": 0.09803921568627451,
        "mean": 0.09803921568627451,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.09803921568627451,
        "sum_squared": 0.009611687812379853,
        "min": 0.09803921568627451,
        "max": 0.09803921568627451,
        "mean": 0.09803921568627451,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=mistralai_mistral-7b-instruct-v0.3",
    "run_spec": {
      "name": "czech_bank_qa:model=mistralai_mistral-7b-instruct-v0.3",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mistral-7b-instruct-v0.3",
        "model": "mistralai/mistral-7b-instruct-v0.3",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.719674264683443,
        "sum_squared": 0.5179310472476544,
        "min": 0.719674264683443,
        "max": 0.719674264683443,
        "mean": 0.719674264683443,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -3.2593056605527195,
        "sum_squared": 10.623073388911,
        "min": -3.2593056605527195,
        "max": -3.2593056605527195,
        "mean": -3.2593056605527195,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 196.00980392156862,
        "sum_squared": 38419.84323337178,
        "min": 196.00980392156862,
        "max": 196.00980392156862,
        "mean": 196.00980392156862,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0507157554315079,
        "sum_squared": 1.1040035987120043,
        "min": 1.0507157554315079,
        "max": 1.0507157554315079,
        "mean": 1.0507157554315079,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.02398953531478593,
        "sum_squared": 0.0005754978046193613,
        "min": 0.02398953531478593,
        "max": 0.02398953531478593,
        "mean": 0.02398953531478593,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.016628278766387106,
        "sum_squared": 0.0002764996547326803,
        "min": -0.016628278766387106,
        "max": -0.016628278766387106,
        "mean": -0.016628278766387106,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.719674264683443,
        "sum_squared": 0.5179310472476544,
        "min": 0.719674264683443,
        "max": 0.719674264683443,
        "mean": 0.719674264683443,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.719674264683443,
        "sum_squared": 0.5179310472476544,
        "min": 0.719674264683443,
        "max": 0.719674264683443,
        "mean": 0.719674264683443,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -3.2593056605527195,
        "sum_squared": 10.623073388911,
        "min": -3.2593056605527195,
        "max": -3.2593056605527195,
        "mean": -3.2593056605527195,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -3.2593056605527195,
        "sum_squared": 10.623073388911,
        "min": -3.2593056605527195,
        "max": -3.2593056605527195,
        "mean": -3.2593056605527195,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 65.88235294117646,
        "sum_squared": 4340.484429065743,
        "min": 65.88235294117646,
        "max": 65.88235294117646,
        "mean": 65.88235294117646,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 196.00980392156862,
        "sum_squared": 38419.84323337178,
        "min": 196.00980392156862,
        "max": 196.00980392156862,
        "mean": 196.00980392156862,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 196.00980392156862,
        "sum_squared": 38419.84323337178,
        "min": 196.00980392156862,
        "max": 196.00980392156862,
        "mean": 196.00980392156862,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.47058823529411764,
        "sum_squared": 0.22145328719723184,
        "min": 0.47058823529411764,
        "max": 0.47058823529411764,
        "mean": 0.47058823529411764,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47058823529411764,
        "sum_squared": 0.22145328719723184,
        "min": 0.47058823529411764,
        "max": 0.47058823529411764,
        "mean": 0.47058823529411764,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.47058823529411764,
        "sum_squared": 0.22145328719723184,
        "min": 0.47058823529411764,
        "max": 0.47058823529411764,
        "mean": 0.47058823529411764,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=mistralai_mixtral-8x22b-instruct-v0.1",
    "run_spec": {
      "name": "czech_bank_qa:model=mistralai_mixtral-8x22b-instruct-v0.1",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x22b-instruct-v0.1",
        "model": "mistralai/mixtral-8x22b-instruct-v0.1",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.8622959384731218,
        "sum_squared": 0.7435542855072419,
        "min": 0.8622959384731218,
        "max": 0.8622959384731218,
        "mean": 0.8622959384731218,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -6.3912928464541485,
        "sum_squared": 40.84862424913597,
        "min": -6.3912928464541485,
        "max": -6.3912928464541485,
        "mean": -6.3912928464541485,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 177.99019607843138,
        "sum_squared": 31680.50990003845,
        "min": 177.99019607843138,
        "max": 177.99019607843138,
        "mean": 177.99019607843138,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.150360769749323,
        "sum_squared": 1.3233299005782548,
        "min": 1.150360769749323,
        "max": 1.150360769749323,
        "mean": 1.150360769749323,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.05180446281651049,
        "sum_squared": 0.0026837023677072176,
        "min": 0.05180446281651049,
        "max": 0.05180446281651049,
        "mean": 0.05180446281651049,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.035908117341686766,
        "sum_squared": 0.0012893928910243458,
        "min": -0.035908117341686766,
        "max": -0.035908117341686766,
        "mean": -0.035908117341686766,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8622959384731218,
        "sum_squared": 0.7435542855072419,
        "min": 0.8622959384731218,
        "max": 0.8622959384731218,
        "mean": 0.8622959384731218,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.8622959384731218,
        "sum_squared": 0.7435542855072419,
        "min": 0.8622959384731218,
        "max": 0.8622959384731218,
        "mean": 0.8622959384731218,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -6.3912928464541485,
        "sum_squared": 40.84862424913597,
        "min": -6.3912928464541485,
        "max": -6.3912928464541485,
        "mean": -6.3912928464541485,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -6.3912928464541485,
        "sum_squared": 40.84862424913597,
        "min": -6.3912928464541485,
        "max": -6.3912928464541485,
        "mean": -6.3912928464541485,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 45.627450980392155,
        "sum_squared": 2081.864282968089,
        "min": 45.627450980392155,
        "max": 45.627450980392155,
        "mean": 45.627450980392155,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 177.99019607843138,
        "sum_squared": 31680.50990003845,
        "min": 177.99019607843138,
        "max": 177.99019607843138,
        "mean": 177.99019607843138,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 177.99019607843138,
        "sum_squared": 31680.50990003845,
        "min": 177.99019607843138,
        "max": 177.99019607843138,
        "mean": 177.99019607843138,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.5980392156862745,
        "sum_squared": 0.35765090349865436,
        "min": 0.5980392156862745,
        "max": 0.5980392156862745,
        "mean": 0.5980392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5980392156862745,
        "sum_squared": 0.35765090349865436,
        "min": 0.5980392156862745,
        "max": 0.5980392156862745,
        "mean": 0.5980392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.5980392156862745,
        "sum_squared": 0.35765090349865436,
        "min": 0.5980392156862745,
        "max": 0.5980392156862745,
        "mean": 0.5980392156862745,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=mistralai_mixtral-8x7b-instruct-v0.1",
    "run_spec": {
      "name": "czech_bank_qa:model=mistralai_mixtral-8x7b-instruct-v0.1",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/mixtral-8x7b-instruct-v0.1",
        "model": "mistralai/mixtral-8x7b-instruct-v0.1",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 0.7946161055097393,
        "sum_squared": 0.6314147551354651,
        "min": 0.7946161055097393,
        "max": 0.7946161055097393,
        "mean": 0.7946161055097393,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -1.4173527080221897,
        "sum_squared": 2.0088886989378345,
        "min": -1.4173527080221897,
        "max": -1.4173527080221897,
        "mean": -1.4173527080221897,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 194.5,
        "sum_squared": 37830.25,
        "min": 194.5,
        "max": 194.5,
        "mean": 194.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0208615775413281,
        "sum_squared": 1.042158360500169,
        "min": 1.0208615775413281,
        "max": 1.0208615775413281,
        "mean": 1.0208615775413281,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.010513150247065069,
        "sum_squared": 0.00011052632811736431,
        "min": 0.010513150247065069,
        "max": 0.010513150247065069,
        "mean": 0.010513150247065069,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.007287160452556245,
        "sum_squared": 5.310270746129974e-05,
        "min": -0.007287160452556245,
        "max": -0.007287160452556245,
        "mean": -0.007287160452556245,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1205.7254901960785,
        "sum_squared": 1453773.9577085737,
        "min": 1205.7254901960785,
        "max": 1205.7254901960785,
        "mean": 1205.7254901960785,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7946161055097393,
        "sum_squared": 0.6314147551354651,
        "min": 0.7946161055097393,
        "max": 0.7946161055097393,
        "mean": 0.7946161055097393,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.7946161055097393,
        "sum_squared": 0.6314147551354651,
        "min": 0.7946161055097393,
        "max": 0.7946161055097393,
        "mean": 0.7946161055097393,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -1.4173527080221897,
        "sum_squared": 2.0088886989378345,
        "min": -1.4173527080221897,
        "max": -1.4173527080221897,
        "mean": -1.4173527080221897,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -1.4173527080221897,
        "sum_squared": 2.0088886989378345,
        "min": -1.4173527080221897,
        "max": -1.4173527080221897,
        "mean": -1.4173527080221897,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 68.6470588235294,
        "sum_squared": 4712.418685121106,
        "min": 68.6470588235294,
        "max": 68.6470588235294,
        "mean": 68.6470588235294,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 194.5,
        "sum_squared": 37830.25,
        "min": 194.5,
        "max": 194.5,
        "mean": 194.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 194.5,
        "sum_squared": 37830.25,
        "min": 194.5,
        "max": 194.5,
        "mean": 194.5,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.46078431372549017,
        "sum_squared": 0.21232218377547094,
        "min": 0.46078431372549017,
        "max": 0.46078431372549017,
        "mean": 0.46078431372549017,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46078431372549017,
        "sum_squared": 0.21232218377547094,
        "min": 0.46078431372549017,
        "max": 0.46078431372549017,
        "mean": 0.46078431372549017,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.46078431372549017,
        "sum_squared": 0.21232218377547094,
        "min": 0.46078431372549017,
        "max": 0.46078431372549017,
        "mean": 0.46078431372549017,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=openai_gpt-4o-2024-08-06",
    "run_spec": {
      "name": "czech_bank_qa:model=openai_gpt-4o-2024-08-06",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-2024-08-06",
        "model": "openai/gpt-4o-2024-08-06",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 981.8137254901961,
        "sum_squared": 963958.1915609383,
        "min": 981.8137254901961,
        "max": 981.8137254901961,
        "mean": 981.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.5448027592079312,
        "sum_squared": 2.3864155648564376,
        "min": 1.5448027592079312,
        "max": 1.5448027592079312,
        "mean": 1.5448027592079312,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 193.2843137254902,
        "sum_squared": 37358.82593233372,
        "min": 193.2843137254902,
        "max": 193.2843137254902,
        "mean": 193.2843137254902,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 981.8137254901961,
        "sum_squared": 963958.1915609383,
        "min": 981.8137254901961,
        "max": 981.8137254901961,
        "mean": 981.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 981.8137254901961,
        "sum_squared": 963958.1915609383,
        "min": 981.8137254901961,
        "max": 981.8137254901961,
        "mean": 981.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.5448027592079312,
        "sum_squared": 2.3864155648564376,
        "min": 1.5448027592079312,
        "max": 1.5448027592079312,
        "mean": 1.5448027592079312,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.5448027592079312,
        "sum_squared": 2.3864155648564376,
        "min": 1.5448027592079312,
        "max": 1.5448027592079312,
        "mean": 1.5448027592079312,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 54.470588235294116,
        "sum_squared": 2967.0449826989616,
        "min": 54.470588235294116,
        "max": 54.470588235294116,
        "mean": 54.470588235294116,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 193.2843137254902,
        "sum_squared": 37358.82593233372,
        "min": 193.2843137254902,
        "max": 193.2843137254902,
        "mean": 193.2843137254902,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 193.2843137254902,
        "sum_squared": 37358.82593233372,
        "min": 193.2843137254902,
        "max": 193.2843137254902,
        "mean": 193.2843137254902,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0196078431372549,
        "sum_squared": 0.00038446751249519417,
        "min": 0.0196078431372549,
        "max": 0.0196078431372549,
        "mean": 0.0196078431372549,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0196078431372549,
        "sum_squared": 0.00038446751249519417,
        "min": 0.0196078431372549,
        "max": 0.0196078431372549,
        "mean": 0.0196078431372549,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0196078431372549,
        "sum_squared": 0.00038446751249519417,
        "min": 0.0196078431372549,
        "max": 0.0196078431372549,
        "mean": 0.0196078431372549,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=openai_gpt-4o-mini-2024-07-18",
    "run_spec": {
      "name": "czech_bank_qa:model=openai_gpt-4o-mini-2024-07-18",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "openai/gpt-4o-mini-2024-07-18",
        "model": "openai/gpt-4o-mini-2024-07-18",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 981.8137254901961,
        "sum_squared": 963958.1915609383,
        "min": 981.8137254901961,
        "max": 981.8137254901961,
        "mean": 981.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1134791911817064,
        "sum_squared": 1.239835909194667,
        "min": 1.1134791911817064,
        "max": 1.1134791911817064,
        "mean": 1.1134791911817064,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 178.0392156862745,
        "sum_squared": 31697.962322183772,
        "min": 178.0392156862745,
        "max": 178.0392156862745,
        "mean": 178.0392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 981.8137254901961,
        "sum_squared": 963958.1915609383,
        "min": 981.8137254901961,
        "max": 981.8137254901961,
        "mean": 981.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 981.8137254901961,
        "sum_squared": 963958.1915609383,
        "min": 981.8137254901961,
        "max": 981.8137254901961,
        "mean": 981.8137254901961,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1134791911817064,
        "sum_squared": 1.239835909194667,
        "min": 1.1134791911817064,
        "max": 1.1134791911817064,
        "mean": 1.1134791911817064,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1134791911817064,
        "sum_squared": 1.239835909194667,
        "min": 1.1134791911817064,
        "max": 1.1134791911817064,
        "mean": 1.1134791911817064,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 49.549019607843135,
        "sum_squared": 2455.1053440984233,
        "min": 49.549019607843135,
        "max": 49.549019607843135,
        "mean": 49.549019607843135,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.0392156862745,
        "sum_squared": 31697.962322183772,
        "min": 178.0392156862745,
        "max": 178.0392156862745,
        "mean": 178.0392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 178.0392156862745,
        "sum_squared": 31697.962322183772,
        "min": 178.0392156862745,
        "max": 178.0392156862745,
        "mean": 178.0392156862745,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.058823529411764705,
        "sum_squared": 0.0034602076124567475,
        "min": 0.058823529411764705,
        "max": 0.058823529411764705,
        "mean": 0.058823529411764705,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.058823529411764705,
        "sum_squared": 0.0034602076124567475,
        "min": 0.058823529411764705,
        "max": 0.058823529411764705,
        "mean": 0.058823529411764705,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.058823529411764705,
        "sum_squared": 0.0034602076124567475,
        "min": 0.058823529411764705,
        "max": 0.058823529411764705,
        "mean": 0.058823529411764705,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  },
  {
    "run_path": "benchmark_output/runs/v0.4.0/czech_bank_qa:model=qwen_qwen2-72b-instruct",
    "run_spec": {
      "name": "czech_bank_qa:model=qwen_qwen2-72b-instruct",
      "scenario_spec": {
        "class_name": "helm.benchmark.scenarios.czech_bank_qa_scenario.CzechBankQAScenario",
        "args": {
          "config_name": "berka_queries_1024_2024_12_18"
        }
      },
      "adapter_spec": {
        "method": "generation",
        "global_prefix": "",
        "global_suffix": "",
        "instructions": "Given a SQLite database schema and the following instructions, generate a SQLite query that corresponds to the instructions. Answer with only the query.\n\nDatabase schema:\nCREATE TABLE \"account\" (\n  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"frequency\" varchar(18) NOT NULL\n,  \"date\" date NOT NULL\n,  PRIMARY KEY (\"account_id\")\n,  CONSTRAINT \"account_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"card\" (\n  \"card_id\" integer NOT NULL DEFAULT '0'\n,  \"disp_id\" integer NOT NULL\n,  \"type\" varchar(7) NOT NULL\n,  \"issued\" date NOT NULL\n,  PRIMARY KEY (\"card_id\")\n,  CONSTRAINT \"card_ibfk_1\" FOREIGN KEY (\"disp_id\") REFERENCES \"disp\" (\"disp_id\")\n);\nCREATE TABLE \"client\" (\n  \"client_id\" integer NOT NULL\n,  \"gender\" varchar(1) NOT NULL\n,  \"birth_date\" date NOT NULL\n,  \"district_id\" integer NOT NULL\n,  PRIMARY KEY (\"client_id\")\n,  CONSTRAINT \"client_ibfk_1\" FOREIGN KEY (\"district_id\") REFERENCES \"district\" (\"district_id\")\n);\nCREATE TABLE \"disp\" (\n  \"disp_id\" integer NOT NULL\n,  \"client_id\" integer NOT NULL\n,  \"account_id\" integer NOT NULL\n,  \"type\" varchar(9) NOT NULL\n,  PRIMARY KEY (\"disp_id\")\n,  CONSTRAINT \"disp_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n,  CONSTRAINT \"disp_ibfk_2\" FOREIGN KEY (\"client_id\") REFERENCES \"client\" (\"client_id\")\n);\nCREATE TABLE \"district\" (\n  \"district_id\" integer NOT NULL DEFAULT '0'\n,  \"A2\" varchar(19) NOT NULL\n,  \"A3\" varchar(15) NOT NULL\n,  \"A4\" integer NOT NULL\n,  \"A5\" integer NOT NULL\n,  \"A6\" integer NOT NULL\n,  \"A7\" integer NOT NULL\n,  \"A8\" integer NOT NULL\n,  \"A9\" integer NOT NULL\n,  \"A10\" decimal(4,1) NOT NULL\n,  \"A11\" integer NOT NULL\n,  \"A12\" decimal(4,1) DEFAULT NULL\n,  \"A13\" decimal(3,2) NOT NULL\n,  \"A14\" integer NOT NULL\n,  \"A15\" integer DEFAULT NULL\n,  \"A16\" integer NOT NULL\n,  PRIMARY KEY (\"district_id\")\n);\nCREATE TABLE \"loan\" (\n  \"loan_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"date\" date NOT NULL\n,  \"amount\" integer NOT NULL\n,  \"duration\" integer NOT NULL\n,  \"payments\" decimal(6,2) NOT NULL\n,  \"status\" varchar(1) NOT NULL\n,  PRIMARY KEY (\"loan_id\")\n,  CONSTRAINT \"loan_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"order\" (\n  \"order_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL\n,  \"bank_to\" varchar(2) NOT NULL\n,  \"account_to\" integer NOT NULL\n,  \"amount\" decimal(6,1) NOT NULL\n,  \"k_symbol\" varchar(8) NOT NULL\n,  PRIMARY KEY (\"order_id\")\n,  CONSTRAINT \"order_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\nCREATE TABLE \"trans\" (\n  \"trans_id\" integer NOT NULL DEFAULT '0'\n,  \"account_id\" integer NOT NULL DEFAULT '0'\n,  \"date\" date NOT NULL\n,  \"type\" varchar(6) NOT NULL\n,  \"operation\" varchar(14) DEFAULT NULL\n,  \"amount\" integer NOT NULL\n,  \"balance\" integer NOT NULL\n,  \"k_symbol\" varchar(11) DEFAULT NULL\n,  \"bank\" varchar(2) DEFAULT NULL\n,  \"account\" integer  DEFAULT NULL\n,  PRIMARY KEY (\"trans_id\")\n,  CONSTRAINT \"trans_ibfk_1\" FOREIGN KEY (\"account_id\") REFERENCES \"account\" (\"account_id\")\n);\n",
        "input_prefix": "Instruction: ",
        "input_suffix": "\n",
        "reference_prefix": "A. ",
        "reference_suffix": "\n",
        "chain_of_thought_prefix": "",
        "chain_of_thought_suffix": "\n",
        "output_prefix": "SQL Query: ",
        "output_suffix": "\n",
        "instance_prefix": "\n",
        "substitutions": [],
        "max_train_instances": 5,
        "max_eval_instances": 1000,
        "num_outputs": 1,
        "num_train_trials": 1,
        "num_trials": 1,
        "sample_train": true,
        "model_deployment": "together/qwen2-72b-instruct",
        "model": "qwen/qwen2-72b-instruct",
        "temperature": 0.0,
        "max_tokens": 512,
        "stop_sequences": [
          "\n\n"
        ],
        "multi_label": false
      },
      "metric_specs": [
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicGenerationMetric",
          "args": {
            "names": []
          }
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.BasicReferenceMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.basic_metrics.InstancesPerSplitMetric",
          "args": {}
        },
        {
          "class_name": "helm.benchmark.metrics.czech_bank_qa_metrics.CzechBankQAMetrics",
          "args": {}
        }
      ],
      "data_augmenter_spec": {
        "perturbation_specs": [],
        "should_augment_train_instances": false,
        "should_include_original_train": false,
        "should_skip_unchanged_train": false,
        "should_augment_eval_instances": false,
        "should_include_original_eval": false,
        "should_skip_unchanged_eval": false,
        "seeds_per_instance": 1
      },
      "groups": [
        "czech_bank_qa"
      ],
      "annotators": [
        {
          "class_name": "helm.benchmark.annotation.czech_bank_qa_annotator.CzechBankQAAnnotator",
          "args": {}
        }
      ]
    },
    "stats": [
      {
        "name": {
          "name": "num_references",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 986.3529411764706,
        "sum_squared": 972892.1245674741,
        "min": 986.3529411764706,
        "max": 986.3529411764706,
        "mean": 986.3529411764706,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "training_co2_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "training_energy_cost",
          "split": "test"
        },
        "count": 0,
        "sum": 0.0,
        "sum_squared": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test"
        },
        "count": 1,
        "sum": 1.1277981949787514,
        "sum_squared": 1.27192876859733,
        "min": 1.1277981949787514,
        "max": 1.1277981949787514,
        "mean": 1.1277981949787514,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test"
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test"
        },
        "count": 1,
        "sum": -1.9281269539680632,
        "sum_squared": 3.7176735506181617,
        "min": -1.9281269539680632,
        "max": -1.9281269539680632,
        "mean": -1.9281269539680632,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test"
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test"
        },
        "count": 1,
        "sum": 186.6764705882353,
        "sum_squared": 34848.10467128028,
        "min": 186.6764705882353,
        "max": 186.6764705882353,
        "mean": 186.6764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "perplexity",
          "split": "test"
        },
        "count": 1,
        "sum": 1.0371195153877226,
        "sum_squared": 1.0756168891980646,
        "min": 1.0371195153877226,
        "max": 1.0371195153877226,
        "mean": 1.0371195153877226,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "bits_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": 0.014901177346714714,
        "sum_squared": 0.00022204508631824376,
        "min": 0.014901177346714714,
        "max": 0.014901177346714714,
        "mean": 0.014901177346714714,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob_per_byte",
          "split": "test"
        },
        "count": 1,
        "sum": -0.01032870906489903,
        "sum_squared": 0.00010668223094732742,
        "min": -0.01032870906489903,
        "max": -0.01032870906489903,
        "mean": -0.01032870906489903,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_references",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_trials",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 986.3529411764706,
        "sum_squared": 972892.1245674741,
        "min": 986.3529411764706,
        "max": 986.3529411764706,
        "mean": 986.3529411764706,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_prompt_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 986.3529411764706,
        "sum_squared": 972892.1245674741,
        "min": 986.3529411764706,
        "max": 986.3529411764706,
        "mean": 986.3529411764706,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_completion_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_output_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1277981949787514,
        "sum_squared": 1.27192876859733,
        "min": 1.1277981949787514,
        "max": 1.1277981949787514,
        "mean": 1.1277981949787514,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "inference_runtime",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.1277981949787514,
        "sum_squared": 1.27192876859733,
        "min": 1.1277981949787514,
        "max": 1.1277981949787514,
        "mean": 1.1277981949787514,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "batch_size",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_length",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_stop",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_endoftext",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "finish_reason_unknown",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_train_instances",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "prompt_truncated",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.0,
        "sum_squared": 0.0,
        "min": 0.0,
        "max": 0.0,
        "mean": 0.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "max_prob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 1.0,
        "sum_squared": 1.0,
        "min": 1.0,
        "max": 1.0,
        "mean": 1.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -1.9281269539680632,
        "sum_squared": 3.7176735506181617,
        "min": -1.9281269539680632,
        "max": -1.9281269539680632,
        "mean": -1.9281269539680632,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "logprob",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": -1.9281269539680632,
        "sum_squared": 3.7176735506181617,
        "min": -1.9281269539680632,
        "max": -1.9281269539680632,
        "mean": -1.9281269539680632,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_perplexity_tokens",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 52.90196078431372,
        "sum_squared": 2798.617454825067,
        "min": 52.90196078431372,
        "max": 52.90196078431372,
        "mean": 52.90196078431372,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 186.6764705882353,
        "sum_squared": 34848.10467128028,
        "min": 186.6764705882353,
        "max": 186.6764705882353,
        "mean": 186.6764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_bytes",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 186.6764705882353,
        "sum_squared": 34848.10467128028,
        "min": 186.6764705882353,
        "max": 186.6764705882353,
        "mean": 186.6764705882353,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "num_instances",
          "split": "test"
        },
        "count": 1,
        "sum": 102.0,
        "sum_squared": 10404.0,
        "min": 102.0,
        "max": 102.0,
        "mean": 102.0,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test"
        },
        "count": 1,
        "sum": 0.16666666666666666,
        "sum_squared": 0.027777777777777776,
        "min": 0.16666666666666666,
        "max": 0.16666666666666666,
        "mean": 0.16666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "robustness",
            "robustness": true,
            "fairness": false,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16666666666666666,
        "sum_squared": 0.027777777777777776,
        "min": 0.16666666666666666,
        "max": 0.16666666666666666,
        "mean": 0.16666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      },
      {
        "name": {
          "name": "error_rate",
          "split": "test",
          "perturbation": {
            "name": "fairness",
            "robustness": false,
            "fairness": true,
            "computed_on": "worst"
          }
        },
        "count": 1,
        "sum": 0.16666666666666666,
        "sum_squared": 0.027777777777777776,
        "min": 0.16666666666666666,
        "max": 0.16666666666666666,
        "mean": 0.16666666666666666,
        "variance": 0.0,
        "stddev": 0.0
      }
    ]
  }
]