MMTU leaderboard

Tables and table-based use cases play a crucial role in many real-world applications, such as spreadsheets, databases, and computational notebooks, which traditionally require expert-level users like data engineers, analysts, and database administrators to operate. Although LLMs have shown remarkable progress in working with tables, comprehensive benchmarking of such capabilities remains limited, often narrowly focusing on tasks like NL-to-SQL and Table-QA, while overlooking the broader spectrum of real-world tasks that professional users face today.

We introduce MMTU, a large-scale benchmark with around 28K questions across 25 real-world table tasks, designed to comprehensively evaluate models ability to understand, reason, and manipulate real tables at the expert-level. These tasks are drawn from decades' worth of computer science research on tabular data, with a focus on complex table tasks faced by professional users. We show that MMTU require a combination of skills -- including table understanding, reasoning, and coding -- that remain challenging for today's frontier models, where even frontier reasoning models like OpenAI GPT-5 and DeepSeek R1 score only around 69% and 57% respectively, suggesting significant room for improvement.

{

"headers": [
- "Model",
- "Model type",
- "Overall",
- "Column Relationship",
- "Column Transform",
- "Data Cleaning",
- "KB mapping",
- "NL-2-code",
- "Table Join",
- "Table Matching",
- "Table QA",
- "Table Transform",
- "Table Understanding",
- "Model size",
- "Data Source",
- "Date"
],
"data": [
- [
  - "gpt-5",
  - "Reasoning",
  - 0.696,
  - 0.648,
  - 0.521,
  - 0.511,
  - 0.612,
  - 0.711,
  - 0.76,
  - 0.916,
  - 0.895,
  - 0.631,
  - 0.929,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "o3",
  - "Reasoning",
  - 0.691,
  - 0.687,
  - 0.52,
  - 0.518,
  - 0.614,
  - 0.71,
  - 0.765,
  - 0.911,
  - 0.851,
  - 0.608,
  - 0.883,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "gpt-5-mini",
  - "Reasoning",
  - 0.667,
  - 0.585,
  - 0.513,
  - 0.492,
  - 0.528,
  - 0.698,
  - 0.771,
  - 0.895,
  - 0.877,
  - 0.595,
  - 0.932,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Gemini-2.5-pro",
  - "Reasoning",
  - 0.665,
  - 0.57,
  - 0.486,
  - 0.546,
  - 0.608,
  - 0.717,
  - 0.713,
  - 0.884,
  - 0.876,
  - 0.583,
  - 0.842,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "o4-mini (2024-11-20)",
  - "Reasoning",
  - 0.66,
  - 0.61,
  - 0.48,
  - 0.46,
  - 0.503,
  - 0.688,
  - 0.765,
  - 0.907,
  - 0.887,
  - 0.598,
  - 0.914,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Grok-3-mini",
  - "Reasoning",
  - 0.645,
  - 0.679,
  - 0.465,
  - 0.419,
  - 0.548,
  - 0.686,
  - 0.771,
  - 0.888,
  - 0.859,
  - 0.538,
  - 0.79,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Gemini-2.5-flash",
  - "Reasoning",
  - 0.625,
  - 0.596,
  - 0.438,
  - 0.437,
  - 0.472,
  - 0.707,
  - 0.733,
  - 0.889,
  - 0.849,
  - 0.544,
  - 0.82,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Deepseek-R1-0528",
  - "Reasoning",
  - 0.58,
  - 0.581,
  - 0.451,
  - 0.399,
  - 0.49,
  - 0.681,
  - 0.767,
  - 0.86,
  - 0.857,
  - 0.383,
  - 0.538,
  - "685B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "gpt-5-chat",
  - "Chat",
  - 0.577,
  - 0.452,
  - 0.416,
  - 0.456,
  - 0.543,
  - 0.704,
  - 0.759,
  - 0.874,
  - 0.728,
  - 0.445,
  - 0.591,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "gpt-oss-120b",
  - "Reasoning",
  - 0.543,
  - 0.526,
  - 0.463,
  - 0.253,
  - 0.423,
  - 0.67,
  - 0.641,
  - 0.777,
  - 0.86,
  - 0.362,
  - 0.747,
  - "120B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen3-235B-A22B-Thinking-2507",
  - "Reasoning",
  - 0.529,
  - 0.276,
  - 0.437,
  - 0.26,
  - 0.476,
  - 0.694,
  - 0.606,
  - 0.829,
  - 0.831,
  - 0.489,
  - 0.673,
  - "235B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen3-235B-A22B-Instruct-2507",
  - "Chat",
  - 0.524,
  - 0.442,
  - 0.378,
  - 0.426,
  - 0.413,
  - 0.654,
  - 0.667,
  - 0.837,
  - 0.677,
  - 0.375,
  - 0.576,
  - "235B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "GPT-4o (2024-11-20)",
  - "Chat",
  - 0.507,
  - 0.317,
  - 0.369,
  - 0.445,
  - 0.498,
  - 0.578,
  - 0.661,
  - 0.847,
  - 0.695,
  - 0.341,
  - 0.47,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen3-32B",
  - "Reasoning",
  - 0.506,
  - 0.496,
  - 0.397,
  - 0.338,
  - 0.358,
  - 0.615,
  - 0.703,
  - 0.839,
  - 0.825,
  - 0.388,
  - 0.265,
  - "32B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Llama-4-Maverick-17B-128E-Instruct-FP8",
  - "Chat",
  - 0.49,
  - 0.372,
  - 0.361,
  - 0.402,
  - 0.42,
  - 0.637,
  - 0.674,
  - 0.815,
  - 0.798,
  - 0.238,
  - 0.421,
  - "402B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "gpt-oss-20b",
  - "Reasoning",
  - 0.478,
  - 0.424,
  - 0.376,
  - 0.18,
  - 0.305,
  - 0.623,
  - 0.539,
  - 0.78,
  - 0.828,
  - 0.343,
  - 0.685,
  - "20B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen3-8B",
  - "Reasoning",
  - 0.477,
  - 0.408,
  - 0.339,
  - 0.326,
  - 0.356,
  - 0.604,
  - 0.689,
  - 0.82,
  - 0.808,
  - 0.398,
  - 0.193,
  - "8B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Llama3.3-70B",
  - "Chat",
  - 0.454,
  - 0.338,
  - 0.325,
  - 0.386,
  - 0.436,
  - 0.595,
  - 0.653,
  - 0.809,
  - 0.66,
  - 0.311,
  - 0.161,
  - "70B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Mistral-Large-2411",
  - "Chat",
  - 0.446,
  - 0.309,
  - 0.367,
  - 0.373,
  - 0.457,
  - 0.563,
  - 0.672,
  - 0.83,
  - 0.646,
  - 0.283,
  - 0.048,
  - "123B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Mistral-Small-2503",
  - "Chat",
  - 0.417,
  - 0.319,
  - 0.306,
  - 0.284,
  - 0.377,
  - 0.571,
  - 0.667,
  - 0.775,
  - 0.621,
  - 0.242,
  - 0.185,
  - "24B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "GPT-4o-mini",
  - "Chat",
  - 0.4,
  - 0.268,
  - 0.313,
  - 0.262,
  - 0.326,
  - 0.565,
  - 0.558,
  - 0.774,
  - 0.553,
  - 0.308,
  - 0.225,
  - "unknown",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Llama-4-Scout-17B-16E-Instruct",
  - "Chat",
  - 0.393,
  - 0.272,
  - 0.21,
  - 0.289,
  - 0.355,
  - 0.56,
  - 0.633,
  - 0.76,
  - 0.74,
  - 0.186,
  - 0.157,
  - "109B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen3-32B (no_thinking)",
  - "Chat",
  - 0.379,
  - 0.302,
  - 0.294,
  - 0.29,
  - 0.282,
  - 0.467,
  - 0.554,
  - 0.791,
  - 0.571,
  - 0.199,
  - 0.14,
  - "32B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen3-8B (no_thinking)",
  - "Chat",
  - 0.353,
  - 0.265,
  - 0.231,
  - 0.268,
  - 0.299,
  - 0.44,
  - 0.63,
  - 0.727,
  - 0.526,
  - 0.175,
  - 0.091,
  - "8B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Qwen2.5-7B-Instruct",
  - "Chat",
  - 0.31,
  - 0.186,
  - 0.168,
  - 0.213,
  - 0.258,
  - 0.491,
  - 0.525,
  - 0.751,
  - 0.512,
  - 0.149,
  - 0,
  - "7B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ],
- [
  - "Llama-3.1-8B",
  - "Chat",
  - 0.268,
  - 0.16,
  - 0.197,
  - 0.148,
  - 0.264,
  - 0.376,
  - 0.375,
  - 0.623,
  - 0.47,
  - 0.114,
  - 0,
  - "8B",
  - "MMTU",
  - "2025-10-20 00:00:00"
  ]
],
"metadata": null

}

Copy the following snippet to cite MMTU

MMTU leaderboard

Submit on MMTU Leaderboard Introduction

⚠ Please note that you need to submit the JSONL file with your model output.