{
  "title": "Summarization",
  "header": [
    {
      "value": "Model",
      "markdown": false,
      "metadata": {}
    },
    {
      "value": "Helpdesk Call summarization - Score",
      "description": "Helpdesk Call summarization\n\nScore: Score",
      "markdown": false,
      "lower_is_better": false,
      "metadata": {
        "metric": "Score",
        "run_group": "Helpdesk Call summarization"
      }
    },
    {
      "value": "Helpdesk Call summarization - Faithfulness",
      "description": "Helpdesk Call summarization\n\nFaithfulness: Whether all the information expressed by the summary can be inferred from the source transcript.",
      "markdown": false,
      "lower_is_better": false,
      "metadata": {
        "metric": "Faithfulness",
        "run_group": "Helpdesk Call summarization"
      }
    },
    {
      "value": "Helpdesk Call summarization - Relevance",
      "description": "Helpdesk Call summarization\n\nRelevance: Whether the summary includes only important information from the source.",
      "markdown": false,
      "lower_is_better": false,
      "metadata": {
        "metric": "Relevance",
        "run_group": "Helpdesk Call summarization"
      }
    },
    {
      "value": "Helpdesk Call summarization - Coherence",
      "description": "Helpdesk Call summarization\n\nCoherence: Whether the summary organizes the relevant information into a well-structured summary.",
      "markdown": false,
      "lower_is_better": false,
      "metadata": {
        "metric": "Coherence",
        "run_group": "Helpdesk Call summarization"
      }
    }
  ],
  "rows": [
    [
      {
        "value": "DeepSeek v3",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.8161865569272949,
        "description": "min=0.816, mean=0.816, max=0.816, sum=0.816 (1)",
        "style": {
          "font-weight": "bold"
        },
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=deepseek-ai_deepseek-v3"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Llama 3.1 Instruct Turbo (405B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7976680384087768,
        "description": "min=0.798, mean=0.798, max=0.798, sum=0.798 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=meta_llama-3.1-405b-instruct-turbo"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Llama 3.1 Instruct Turbo (70B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7894375857338795,
        "description": "min=0.789, mean=0.789, max=0.789, sum=0.789 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=meta_llama-3.1-70b-instruct-turbo"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Llama 3.1 Instruct Turbo (8B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7661179698216716,
        "description": "min=0.766, mean=0.766, max=0.766, sum=0.766 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=meta_llama-3.1-8b-instruct-turbo"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Mistral Instruct v0.3 (7B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7585733882030161,
        "description": "min=0.759, mean=0.759, max=0.759, sum=0.759 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=mistralai_mistral-7b-instruct-v0.3"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Mixtral Instruct (8x22B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7908093278463623,
        "description": "min=0.791, mean=0.791, max=0.791, sum=0.791 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=mistralai_mixtral-8x22b-instruct-v0.1"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Mixtral Instruct (8x7B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.758573388203016,
        "description": "min=0.759, mean=0.759, max=0.759, sum=0.759 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=mistralai_mixtral-8x7b-instruct-v0.1"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Qwen2.5 Instruct Turbo (72B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7969821673525352,
        "description": "min=0.797, mean=0.797, max=0.797, sum=0.797 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=qwen_qwen2.5-72b-instruct-turbo"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Qwen2.5 Instruct Turbo (7B)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7469135802469118,
        "description": "min=0.747, mean=0.747, max=0.747, sum=0.747 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=qwen_qwen2.5-7b-instruct-turbo"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Claude 3.5 Haiku (20241022)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7969821673525349,
        "description": "min=0.797, mean=0.797, max=0.797, sum=0.797 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=anthropic_claude-3-5-haiku-20241022"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Claude 3.5 Sonnet (20240620)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.8072702331961568,
        "description": "min=0.807, mean=0.807, max=0.807, sum=0.807 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=anthropic_claude-3-5-sonnet-20240620"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Gemini 1.5 Pro (002)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.785322359396431,
        "description": "min=0.785, mean=0.785, max=0.785, sum=0.785 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=google_gemini-1.5-pro-002"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "Gemini 1.5 Flash (002)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7764060356652923,
        "description": "min=0.776, mean=0.776, max=0.776, sum=0.776 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=google_gemini-1.5-flash-002"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "GPT-4o (2024-08-06)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.8134430727023293,
        "description": "min=0.813, mean=0.813, max=0.813, sum=0.813 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=openai_gpt-4o-2024-08-06"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ],
    [
      {
        "value": "GPT-4o mini (2024-07-18)",
        "description": "",
        "markdown": false
      },
      {
        "value": 0.7921810699588453,
        "description": "min=0.792, mean=0.792, max=0.792, sum=0.792 (1)",
        "style": {},
        "markdown": false,
        "run_spec_names": [
          "helpdesk_call_summarization:model=openai_gpt-4o-mini-2024-07-18"
        ]
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      },
      {
        "description": "1 matching runs, but no matching metrics",
        "markdown": false
      }
    ]
  ],
  "links": [
    {
      "text": "LaTeX",
      "href": "benchmark_output/releases/v0.1.0/groups/latex/call_center_scenarios_summarization_metrics.tex"
    },
    {
      "text": "JSON",
      "href": "benchmark_output/releases/v0.1.0/groups/json/call_center_scenarios_summarization_metrics.json"
    }
  ],
  "name": "summarization_metrics"
}