TemporalBench Leaderboard
This leaderboard presents offline evaluation results for agent configurations on the TemporalBench benchmark. It is a pure visualization and validation layer: no agents are executed here, and no LLM APIs are called.
- "headers": [
- "agent_name",
- "agent_type",
- "base_model",
- "T1_acc",
- "T2_acc",
- "T3_acc",
- "T4_acc",
- "overall_mcq_acc",
- "T2_MAE",
- "T2_sMAPE",
- "T4_MAE",
- "T4_sMAPE",
- "MIMIC_T2_OW_sMAPE",
- "MIMIC_T2_OW_RMSSE",
- "MIMIC_T4_OW_sMAPE",
- "MIMIC_T4_OW_RMSSE"
- "data": [
- [
- "Single LLM",
- "single-LLM",
- "gpt-4o",
- 0.4972,
- 0.2984,
- 0.2924,
- 0.267,
- 0.3411,
- 1.1096,
- 0.9136,
- 1.1525,
- 0.8006,
- 15.2,
- 0.55,
- 16.86,
- 0.63
- [
- "TimeSeries Scientist",
- "time-series-specific agent",
- "gpt-4o",
- 0.2479,
- 0.2653,
- 0.2,
- 0.267,
- 0.2396,
- 1.4854,
- 0.9402,
- 1.4683,
- 0.9106,
- 15.81,
- 0.52,
- 17.18,
- 0.64
- [
- "AgentScope",
- "general agent",
- "gpt-4o",
- 0.4818,
- 0.2653,
- 0.2833,
- 0.2757,
- 0.3291,
- 1.0922,
- 78.9881,
- 1.1063,
- 77.4823,
- 11.05,
- 0.43,
- 12.02,
- 0.49
- [
- "MetaGPT",
- "general agent",
- "gpt-4o",
- 0.486,
- 0.2733,
- 0.2691,
- 0.2199,
- 0.3156,
- 1.0675,
- 72.4145,
- 1.1706,
- 82.6721,
- 14.11,
- 0.53,
- 15.4,
- 0.63
- [
- "CAMEL",
- "general agent",
- "gpt-4o",
- 0.4944,
- 0.2618,
- 0.2558,
- 0.2792,
- 0.3233,
- 1.2272,
- 77.8883,
- 1.1099,
- 79.0311,
- 12.02,
- 0.55,
- 15.74,
- 0.59
- [
- "metadata": null
- "headers": [
- "agent_name",
- "agent_type",
- "base_model",
- "T1_acc",
- "T2_acc",
- "T3_acc",
- "T4_acc",
- "FreshRetailNet_T1_acc",
- "FreshRetailNet_T2_acc",
- "FreshRetailNet_T3_acc",
- "FreshRetailNet_T4_acc",
- "PSML_T1_acc",
- "PSML_T2_acc",
- "PSML_T3_acc",
- "PSML_T4_acc",
- "CausalChambers_T1_acc",
- "CausalChambers_T2_acc",
- "CausalChambers_T3_acc",
- "CausalChambers_T4_acc",
- "MIMIC_T1_acc",
- "MIMIC_T2_acc",
- "MIMIC_T3_acc",
- "MIMIC_T4_acc",
- "FreshRetailNet_T2_sMAPE",
- "FreshRetailNet_T2_MAE",
- "PSML_T2_sMAPE",
- "PSML_T2_MAE",
- "CausalChambers_T2_MAE",
- "MIMIC_T2_OW_sMAPE",
- "MIMIC_T2_OW_RMSSE",
- "FreshRetailNet_T4_sMAPE",
- "FreshRetailNet_T4_MAE",
- "PSML_T4_sMAPE",
- "PSML_T4_MAE",
- "CausalChambers_T4_MAE",
- "MIMIC_T4_OW_sMAPE",
- "MIMIC_T4_OW_RMSSE",
- "CausalChambers_T2_OW_RMSSE",
- "CausalChambers_T4_OW_RMSSE"
- "data": [
- [
- "Single LLM",
- "single-LLM",
- "gpt-4o",
- 0.4972,
- 0.2984,
- 0.2924,
- 0.267,
- 0.6364,
- 0.5227,
- 0.0289,
- 0.1364,
- 0.675,
- 0.2067,
- 0.348,
- 0.36,
- 0.1333,
- 0.2733,
- 0.352,
- 0.26,
- 0.4681,
- 0.2128,
- 0.3661,
- 0.2979,
- 1.27,
- 0.12,
- 0.6,
- 0.61,
- 2.48,
- 15.2,
- 0.55,
- 1.29,
- 0.34,
- 0.37,
- 0.44,
- 2.58,
- 16.86,
- 0.63,
- 0.0000257,
- 0.0000269
- [
- "TimeSeries Scientist",
- "time-series-specific agent",
- "gpt-4o",
- 0.2479,
- 0.2653,
- 0.2,
- 0.267,
- 0.3352,
- 0.5682,
- 0.0341,
- 0.5682,
- 0.28,
- 0.2667,
- 0.216,
- 0.2733,
- 0.2867,
- 0.0267,
- 0.216,
- 0.0267,
- 0.1011,
- 0.234,
- 0.2887,
- 0.234,
- 1.27,
- 0.35,
- 0.65,
- 1.53,
- 2.44,
- 15.81,
- 0.52,
- 1.4,
- 0.51,
- 0.48,
- 0.84,
- 2.94,
- 17.18,
- 0.64,
- 0.0000253,
- 0.0000306
- [
- "AgentScope",
- "general agent",
- "gpt-4o",
- 0.4818,
- 0.2653,
- 0.2833,
- 0.2757,
- 0.625,
- 0.1212,
- 0.1364,
- 0.1894,
- 0.66,
- 0.2467,
- 0.272,
- 0.3533,
- 0.12,
- 0.46,
- 0.44,
- 0.32,
- 0.4468,
- 0.2128,
- 0.2395,
- 0.227,
- 126.27,
- 0.12,
- 37.38,
- 0.28,
- 2.76,
- 11.05,
- 0.43,
- 130.86,
- 0.2,
- 30.51,
- 0.35,
- 2.66,
- 12.02,
- 0.49,
- 0.00262,
- 0.00246
- [
- "MetaGPT",
- "general agent",
- "gpt-4o",
- 0.486,
- 0.2733,
- 0.2691,
- 0.2199,
- 0.625,
- 0.0909,
- 0.0511,
- 0.1439,
- 0.675,
- 0.2109,
- 0.22,
- 0.3133,
- 0.1067,
- 0.5933,
- 0.452,
- 0.16,
- 0.4574,
- 0.1702,
- 0.2897,
- 0.2553,
- 126.59,
- 0.13,
- 24.74,
- 0.34,
- 2.62,
- 14.11,
- 0.53,
- 127.22,
- 0.24,
- 43.47,
- 0.4,
- 2.76,
- 15.4,
- 0.63,
- 0.00272,
- 0.00287
- [
- "CAMEL",
- "general agent",
- "gpt-4o",
- 0.4944,
- 0.2618,
- 0.2558,
- 0.2792,
- 0.642,
- 0.0076,
- 0.0625,
- 0.3106,
- 0.685,
- 0.14,
- 0.184,
- 0.3067,
- 0.1,
- 0.66,
- 0.42,
- 0.2667,
- 0.4681,
- 0.2057,
- 0.3014,
- 0.234,
- 126.75,
- 0.13,
- 34.89,
- 0.43,
- 2.99,
- 12.02,
- 0.55,
- 128.18,
- 0.28,
- 35.78,
- 0.45,
- 2.5,
- 15.74,
- 0.59,
- 0.00311,
- 0.0026
- [
- "metadata": null
Upload submission files for manual review.
Required files:
- `results_on_dev_dataset.json`: task-level metrics in leaderboard format.
- `results_on_test_dataset.json`: per-example test outputs with at least `id`, `tier`, `source_dataset`, `label`, and `output` (required when the sample contains forecasting).
Please also include model architecture code and LLM/system details for verification.
Example record (JSON):
```json
{
"agent_name": "Single LLM",
"agent_type": "single-LLM",
"base_model": "gpt-4o",
"T1_acc": null,
"T2_acc": null,
"T3_acc": null,
"T4_acc": null,
"FreshRetailNet_T1_acc": 0.6364,
"FreshRetailNet_T2_acc": 0.5227,
"FreshRetailNet_T3_acc": 0.0289,
"FreshRetailNet_T4_acc": 0.1364,
"PSML_T1_acc": 0.675,
"PSML_T2_acc": 0.2067,
"PSML_T3_acc": 0.348,
"PSML_T4_acc": 0.36,
"CausalChambers_T1_acc": 0.1333,
"CausalChambers_T2_acc": 0.2733,
"CausalChambers_T3_acc": 0.352,
"CausalChambers_T4_acc": 0.26,
"MIMIC_T1_acc": 0.4681,
"MIMIC_T2_acc": 0.2128,
"MIMIC_T3_acc": 0.3661,
"MIMIC_T4_acc": 0.2979,
"T2_sMAPE": null,
"T2_MAE": null,
"T2_OW_sMAPE_MIMIC": null,
"T2_OW_RMSSE_MIMIC": null,
"T4_sMAPE": null,
"T4_MAE": null,
"T4_OW_sMAPE_MIMIC": null,
"T4_OW_RMSSE_MIMIC": null,
"FreshRetailNet_T2_MAE": 0.12,
"FreshRetailNet_T2_sMAPE": 1.27,
"FreshRetailNet_T4_MAE": 0.34,
"FreshRetailNet_T4_sMAPE": 1.29,
"PSML_T2_MAE": 0.61,
"PSML_T2_sMAPE": 0.6,
"PSML_T4_MAE": 0.44,
"PSML_T4_sMAPE": 0.37,
"CausalChambers_T2_MAE": 2.48,
"CausalChambers_T2_OW_RMSSE": 2.57e-05,
"CausalChambers_T4_MAE": 2.58,
"CausalChambers_T4_OW_RMSSE": 2.69e-05,
"MIMIC_T2_OW_sMAPE": 15.2,
"MIMIC_T2_OW_RMSSE": 0.55,
"MIMIC_T4_OW_sMAPE": 16.86,
"MIMIC_T4_OW_RMSSE": 0.63
}
```
The paper describing this benchmark is TemporalBench: A Benchmark for Evaluating LLM-Based Agents on Contextual and Event-Informed Time Series Tasks (https://arxiv.org/abs/2602.13272). We also maintain a public leaderboard and welcome submissions from state-of-the-art models: https://huggingface.co/spaces/Melady/TemporalBench_Leaderboard
What this leaderboard shows
- One row per evaluated agent configuration
- Task-family MCQ metrics for TemporalBench (T1-T4)
- Forecasting metrics for T2/T4 (sMAPE, MAE) and MIMIC OW metrics when provided
- Dataset-level results for: FreshRetailNet, PSML, Causal Chambers, MIMIC
Data requirements
Results are loaded from a local JSON or CSV file. Each record must include:
- Identity fields: `agent_name`, `agent_type`, `base_model`
- Required metrics: `T1_acc`, `T2_acc`, `T3_acc`, `T4_acc` (computed overall)
- Optional metrics:
  - Overall forecasting: `T2_sMAPE`, `T2_MAE`, `T4_sMAPE`, `T4_MAE`
  - MIMIC overall OW: `MIMIC_T2_OW_sMAPE`, `MIMIC_T2_OW_RMSSE`, `MIMIC_T4_OW_sMAPE`, `MIMIC_T4_OW_RMSSE`
  - Dataset-level metrics: `<Dataset>_T{1..4}_acc` and forecasting metrics per dataset
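As a quick sanity check before adding a record, the required identity fields and the presence of at least some accuracy values can be verified with a short script. This is a minimal sketch under the assumption that the results file is a JSON list of records; it is not the leaderboard's actual validation code.

```python
import json

IDENTITY_FIELDS = ["agent_name", "agent_type", "base_model"]
OVERALL_ACC_FIELDS = ["T1_acc", "T2_acc", "T3_acc", "T4_acc"]
DATASETS = ["FreshRetailNet", "PSML", "CausalChambers", "MIMIC"]

def check_record(record: dict) -> list[str]:
    """Collect simple problems with one leaderboard record."""
    problems = [f"missing {f}" for f in IDENTITY_FIELDS if not record.get(f)]
    has_overall = any(record.get(f) is not None for f in OVERALL_ACC_FIELDS)
    has_dataset = any(
        record.get(f"{d}_T{t}_acc") is not None for d in DATASETS for t in range(1, 5)
    )
    if not (has_overall or has_dataset):
        problems.append("no T1-T4 accuracy values (overall or dataset-level)")
    return problems

# Assuming the results file is a JSON list of records (the exact layout is an assumption).
with open("results_on_dev_dataset.json") as fh:
    for rec in json.load(fh):
        for problem in check_record(rec):
            print(f"{rec.get('agent_name', '<unnamed>')}: {problem}")
```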
Overall computation
Overall T1-T4 accuracy and T2/T4 forecasting metrics are computed as weighted averages from dataset-level results using question/series counts. Missing values are ignored.
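As a rough illustration of that aggregation, the sketch below takes per-dataset values and question counts and returns their weighted mean, skipping missing entries. The counts shown are placeholders, not the benchmark's actual per-dataset question counts.

```python
def weighted_overall(values, counts):
    """Weighted average of per-dataset metrics, ignoring missing (None) entries."""
    pairs = [(v, counts[d]) for d, v in values.items() if v is not None]
    if not pairs:
        return None
    total = sum(c for _, c in pairs)
    return sum(v * c for v, c in pairs) / total

# Per-dataset T1 accuracies from the table above; question counts are
# illustrative placeholders, not the real per-dataset counts.
t1_acc = {"FreshRetailNet": 0.6364, "PSML": 0.675, "CausalChambers": 0.1333, "MIMIC": 0.4681}
t1_counts = {"FreshRetailNet": 100, "PSML": 100, "CausalChambers": 100, "MIMIC": 100}

print(round(weighted_overall(t1_acc, t1_counts), 4))  # equal weights here, so this is a plain mean
```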
Submission workflow
Uploads are stored locally for manual review.
For a valid submission, please provide two files:
- `results_on_dev_dataset.json`
  - Follows the leaderboard metrics format.
  - Should include task-level metrics only (e.g., T1-T4 and forecasting metrics).
- `results_on_test_dataset.json`
  - Per-example outputs on the test split.
  - For each example, include at least: `id`, `tier`, `source_dataset`, `label`, and `output` (required when the example contains a forecasting task).
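For reference, a per-example entry in `results_on_test_dataset.json` could look like the sketch below. This is only a hypothetical illustration: the id format, the tier labels, the choice-letter labels, and the list-of-floats shape of `output` for forecasting examples are assumptions, not a prescribed schema.

```python
import json

# Hypothetical per-example test records; all field values are illustrative.
records = [
    {
        "id": "example-0001",
        "tier": "T2",
        "source_dataset": "FreshRetailNet",
        "label": "B",                  # MCQ answer label (format assumed)
        "output": [12.3, 11.8, 13.1],  # forecast values, required for forecasting examples
    },
    {
        "id": "example-0002",
        "tier": "T1",
        "source_dataset": "MIMIC",
        "label": "C",
        "output": None,                # no forecast needed for this example
    },
]

with open("results_on_test_dataset.json", "w") as fh:
    json.dump(records, fh, indent=2)
```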
We also strongly encourage including model and system metadata, such as:
- model architecture code
- LLM(s) used
- key implementation details needed for result verification
Approved submissions are then merged into the main results file and appear on the leaderboard.
Data access
The dataset is available at:
https://huggingface.co/datasets/Melady/TemporalBench
It includes all test tasks and a `forecast_metrics_utils.py` file that documents the standard metric computation utilities.
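For orientation, MAE and sMAPE have the familiar definitions sketched below. The exact conventions used for the leaderboard (including the sMAPE scaling and the OW/RMSSE variants) are those implemented in `forecast_metrics_utils.py`, which should be treated as authoritative.

```python
import numpy as np

def mae(y_true, y_pred):
    """Mean absolute error."""
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    return float(np.mean(np.abs(y_true - y_pred)))

def smape(y_true, y_pred, eps=1e-8):
    """Symmetric MAPE in its common 0-2 form; the benchmark's exact scaling may differ."""
    y_true, y_pred = np.asarray(y_true, float), np.asarray(y_pred, float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0 + eps
    return float(np.mean(np.abs(y_true - y_pred) / denom))

print(mae([1.0, 2.0, 3.0], [1.5, 2.0, 2.0]))    # 0.5
print(smape([1.0, 2.0, 3.0], [1.5, 2.0, 2.0]))  # ~0.27
```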
Citation
Copy the following snippet to cite these results
```bibtex
@misc{weng2026temporalbenchbenchmarkevaluatingllmbased,
title={TemporalBench: A Benchmark for Evaluating LLM-Based Agents on Contextual and Event-Informed Time Series Tasks},
author={Muyan Weng and Defu Cao and Wei Yang and Yashaswi Sharma and Yan Liu},
year={2026},
eprint={2602.13272},
archivePrefix={arXiv},
primaryClass={cs.AI},
url={https://arxiv.org/abs/2602.13272},
}
```