Roblox LLM Leaderboard

Tracking LLM capabilities in Roblox game development.

Benchmarks:

  • RobloxQA: Multiple-choice question answering about Roblox APIs and concepts.
  • RobloxQA_OpenEnded: Open-ended question answering about Roblox APIs and concepts, with no answer choices provided. Response correctness is judged by an ensemble of reasoning LLMs that compare the generated answer to the correct answer.
{
  • "headers": [
    • "Model",
    • "Precision",
    • "Params (B)",
    • "Reasoning",
    • "Open Source",
    • "Average",
    • "RobloxQA",
    • "RobloxQA_OpenEnded"
    ],
  • "data": [
    • [
      • "OpenAI/gpt-o4-mini (high)",
      • null,
      • null,
      • true,
      • false,
      • 63.705,
      • 81.854,
      • 45.557
      ],
    • [
      • "OpenAI/gpt-o4-mini (low)",
      • null,
      • null,
      • true,
      • false,
      • 62.925,
      • 81.377,
      • 44.473
      ],
    • [
      • "Google/Gemini-2.5-Pro-Preview-03-25",
      • null,
      • null,
      • true,
      • false,
      • 61.993,
      • 82.547,
      • 41.439
      ],
    • [
      • "deepseek-ai/DeepSeek-R1",
      • "FP8",
      • 684.531,
      • true,
      • true,
      • 61.841,
      • 81.68,
      • 42.003
      ],
    • [
      • "OpenAI/gpt-4.1",
      • null,
      • null,
      • false,
      • false,
      • 60.888,
      • 80.987,
      • 40.789
      ],
    • [
      • "OpenAI/gpt-o3-mini (high)",
      • null,
      • null,
      • true,
      • false,
      • 60.845,
      • 81.291,
      • 40.399
      ],
    • [
      • "Anthropic/Claude-3.7-Sonnet (Thinking)",
      • null,
      • null,
      • true,
      • false,
      • 60.303,
      • 80.858,
      • 39.749
      ],
    • [
      • "Anthropic/Claude-3.7-Sonnet",
      • null,
      • null,
      • false,
      • false,
      • 58.938,
      • 80.121,
      • 37.755
      ],
    • [
      • "OpenAI/gpt-o3-mini (low)",
      • null,
      • null,
      • true,
      • false,
      • 58.093,
      • 79.688,
      • 36.498
      ],
    • [
      • "Google/Gemini-2.5-Flash-Preview-04-17",
      • null,
      • null,
      • true,
      • false,
      • 57.876,
      • 80.641,
      • 35.111
      ],
    • [
      • "Qwen/Qwen3-235B-A22B",
      • "FP8",
      • 235.094,
      • true,
      • true,
      • 56.793,
      • 78.302,
      • 35.284
      ],
    • [
      • "OpenAI/gpt-4.1-mini",
      • null,
      • null,
      • false,
      • false,
      • 56.576,
      • 79.212,
      • 33.94
      ],
    • [
      • "OpenAI/gpt-4o",
      • null,
      • null,
      • false,
      • false,
      • 56.338,
      • 77.523,
      • 35.154
      ],
    • [
      • "Anthropic/Claude-3.5-Sonnet",
      • null,
      • null,
      • false,
      • false,
      • 56.274,
      • 76.96,
      • 35.587
      ],
    • [
      • "deepseek-ai/DeepSeek-V3",
      • "FP8",
      • 684.531,
      • false,
      • true,
      • 55.601,
      • 77.566,
      • 33.637
      ],
    • [
      • "Qwen/QwQ-32B",
      • "FP16",
      • 32.764,
      • true,
      • true,
      • 54.323,
      • 77.869,
      • 30.776
      ],
    • [
      • "Google/Gemini-2.0-Flash",
      • null,
      • null,
      • false,
      • false,
      • 53.976,
      • 76.7,
      • 31.253
      ],
    • [
      • "boatbomber/Gemma-3-27B-Roblox-Luau",
      • "Q4_K_M",
      • 27.4,
      • false,
      • true,
      • 52.33,
      • 74.838,
      • 29.822
      ],
    • [
      • "OpenAI/gpt-4.1-nano",
      • null,
      • null,
      • false,
      • false,
      • 51.723,
      • 75.227,
      • 28.218
      ],
    • [
      • "deepseek-ai/DeepSeek-R1-Distill-Llama-70B",
      • "FP16",
      • 70.554,
      • true,
      • true,
      • 50.747,
      • 75.574,
      • 25.921
      ],
    • [
      • "Google/Gemini-2.0-Flash-Lite",
      • null,
      • null,
      • false,
      • false,
      • 50.704,
      • 74.924,
      • 26.485
      ],
    • [
      • "OpenAI/gpt-4o-mini",
      • null,
      • null,
      • false,
      • false,
      • 50.228,
      • 74.015,
      • 26.441
      ],
    • [
      • "Qwen/Qwen2.5-72B-Instruct",
      • "FP8",
      • 72.706,
      • false,
      • true,
      • 49.707,
      • 75.314,
      • 24.101
      ],
    • [
      • "meta-llama/Llama-3.3-70B-Instruct",
      • "FP8",
      • 70.554,
      • false,
      • true,
      • 49.621,
      • 75.011,
      • 24.231
      ],
    • [
      • "microsoft/phi-4",
      • "Q6_K",
      • 14.66,
      • false,
      • true,
      • 49.317,
      • 74.924,
      • 23.71
      ],
    • [
      • "google/gemma-3-27b-it",
      • "Q4_K_M",
      • 27.432,
      • false,
      • true,
      • 49.123,
      • 72.239,
      • 26.008
      ],
    • [
      • "mistralai/Mistral-Small-3.1-24B-Instruct-2503",
      • "Q6_K",
      • 24.011,
      • false,
      • true,
      • 48.646,
      • 73.668,
      • 23.624
      ],
    • [
      • "Qwen/Qwen2.5-Coder-32B-Instruct",
      • "FP16",
      • 32.764,
      • false,
      • true,
      • 48.473,
      • 73.105,
      • 23.84
      ],
    • [
      • "mistralai/Mistral-Small-24B-Instruct-2501",
      • "FP16",
      • 23.572,
      • false,
      • true,
      • 48.126,
      • 72.889,
      • 23.364
      ],
    • [
      • "boatbomber/R1-Distill-Qwen-14B-Roblox-Luau",
      • "Q6_K",
      • 14.77,
      • true,
      • true,
      • 46.847,
      • 74.968,
      • 18.726
      ],
    • [
      • "google/gemma-3-12b-it",
      • "Q4_K_M",
      • 12.187,
      • false,
      • true,
      • 46.112,
      • 70.03,
      • 22.193
      ],
    • [
      • "deepseek-ai/DeepSeek-R1-Distill-Qwen-14B",
      • "FP8",
      • 14.77,
      • true,
      • true,
      • 45.894,
      • 72.456,
      • 19.332
      ],
    • [
      • "Qwen/Qwen2.5-7B-Instruct",
      • "FP8",
      • 7.616,
      • false,
      • true,
      • 42.602,
      • 67.692,
      • 17.512
      ],
    • [
      • "internlm/internlm2_5-20b-chat",
      • "Q4_K_M",
      • 19.861,
      • false,
      • true,
      • 40.869,
      • 64.184,
      • 17.555
      ],
    • [
      • "mistralai/Mistral-Nemo-Instruct-2407",
      • "Q4_K_M",
      • 12.248,
      • false,
      • true,
      • 40.674,
      • 64.227,
      • 17.122
      ],
    • [
      • "ibm-granite/granite-3.2-8b-instruct",
      • "Q4_K_M",
      • 8.171,
      • false,
      • true,
      • 40.089,
      • 64.747,
      • 15.431
      ],
    • [
      • "meta-llama/Llama-3.1-8B-Instruct",
      • "FP8",
      • 8.03,
      • false,
      • true,
      • 38.073,
      • 65.786,
      • 10.36
      ],
    • [
      • "mistralai/Mistral-7B-Instruct-v0.3",
      • "FP16",
      • 7.248,
      • false,
      • true,
      • 37.684,
      • 63.014,
      • 12.354
      ],
    • [
      • "meta-llama/Llama-3.2-3B-Instruct",
      • "FP8",
      • 3.213,
      • false,
      • true,
      • 32.766,
      • 59.42,
      • 6.112
      ],
    • [
      • "NousResearch/Hermes-3-Llama-3.2-3B-GGUF",
      • "Q6_K",
      • 3.213,
      • false,
      • true,
      • 31.121,
      • 53.703,
      • 8.539
      ],
    • [
      • "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
      • "FP16",
      • 1.777,
      • true,
      • true,
      • 23.843,
      • 44.435,
      • 3.251
      ],
    • [
      • "boatbomber/Gemma-3-1B-Roblox-Luau",
      • "Q5_K_M",
      • 1.302,
      • false,
      • true,
      • 21.916,
      • 39.974,
      • 3.858
      ],
    • [
      • "google/gemma-3-1b-it",
      • "Q4_K_M",
      • 1,
      • false,
      • true,
      • 21.829,
      • 39.541,
      • 4.118
      ],
    • [
      • "boatbomber/R1-Distill-Qwen-1.5B-Roblox-Luau",
      • "Q8_K",
      • 1.5,
      • true,
      • true,
      • 19.793,
      • 38.025,
      • 1.56
      ],
    • [
      • "google/gemma-3-1b-it-qat-q4_0-gguf",
      • "Q4_0",
      • 1,
      • false,
      • true,
      • 19.296,
      • 33.954,
      • 4.638
      ]
    ],
  • "metadata": null
}