TemporalBench Leaderboard

This leaderboard presents offline evaluation results for agent configurations on the TemporalBench benchmark. It is a pure visualization and validation layer: no agents are executed here, and no LLM APIs are called.

{
  "headers": [
    "agent_name",
    "agent_type",
    "base_model",
    "T1_acc",
    "T2_acc",
    "T3_acc",
    "T4_acc",
    "overall_mcq_acc",
    "T2_MAE",
    "T2_sMAPE",
    "T4_MAE",
    "T4_sMAPE",
    "MIMIC_T2_OW_sMAPE",
    "MIMIC_T2_OW_RMSSE",
    "MIMIC_T4_OW_sMAPE",
    "MIMIC_T4_OW_RMSSE"
  ],
  "data": [
    [
      "Single LLM",
      "single-LLM",
      "gpt-4o",
      0.4972,
      0.2984,
      0.2924,
      0.267,
      0.3411,
      1.1096,
      0.9136,
      1.1525,
      0.8006,
      15.2,
      0.55,
      16.86,
      0.63
    ],
    [
      "TimeSeries Scientist",
      "time-series-specific agent",
      "gpt-4o",
      0.2479,
      0.2653,
      0.2,
      0.267,
      0.2396,
      1.4854,
      0.9402,
      1.4683,
      0.9106,
      15.81,
      0.52,
      17.18,
      0.64
    ],
    [
      "AgentScope",
      "general agent",
      "gpt-4o",
      0.4818,
      0.2653,
      0.2833,
      0.2757,
      0.3291,
      1.0922,
      78.9881,
      1.1063,
      77.4823,
      11.05,
      0.43,
      12.02,
      0.49
    ],
    [
      "MetaGPT",
      "general agent",
      "gpt-4o",
      0.486,
      0.2733,
      0.2691,
      0.2199,
      0.3156,
      1.0675,
      72.4145,
      1.1706,
      82.6721,
      14.11,
      0.53,
      15.4,
      0.63
    ],
    [
      "CAMEL",
      "general agent",
      "gpt-4o",
      0.4944,
      0.2618,
      0.2558,
      0.2792,
      0.3233,
      1.2272,
      77.8883,
      1.1099,
      79.0311,
      12.02,
      0.55,
      15.74,
      0.59
    ]
  ],
  "metadata": null
}