{
  "benchmarks": {
    "version": "1.0",
    "runDate": "2026-04-18T00:00:00.000Z",
    "methodology": "Aggregated scores from published third-party benchmarks. SWE-bench measures real GitHub issue resolution. LiveCodeBench measures competitive programming ability. HumanEval measures basic code generation. BigCodeBench measures practical, multi-step coding tasks. All scores normalized to 0-100 scale.",
    "sources": [
      {
        "name": "SWE-bench Verified",
        "url": "https://www.swebench.com/",
        "description": "Resolving real GitHub issues in production codebases"
      },
      {
        "name": "LiveCodeBench",
        "url": "https://livecodebench.github.io/",
        "description": "Competitive programming problems"
      },
      {
        "name": "HumanEval",
        "url": "https://github.com/openai/human-eval",
        "description": "Function-level code generation"
      },
      {
        "name": "BigCodeBench",
        "url": "https://huggingface.co/spaces/bigcode/bigcodebench",
        "description": "Practical, multi-step coding tasks"
      }
    ],
    "models": [
      {
        "modelId": "claude-sonnet-4",
        "modelName": "Claude Sonnet 4",
        "provider": "Anthropic",
        "overallScore": 78,
        "sweBench": 74,
        "liveCodeBench": 82,
        "humanEval": 92,
        "bigCodeBench": 64,
        "strengths": [
          "Price-performance leader",
          "Strong at web development",
          "Excellent code review"
        ],
        "weaknesses": [
          "Struggles with complex algorithms",
          "Less consistent on system design"
        ],
        "pricePerScorePoint": 0.038
      },
      {
        "modelId": "claude-opus-4",
        "modelName": "Claude Opus 4",
        "provider": "Anthropic",
        "overallScore": 86,
        "sweBench": 84,
        "liveCodeBench": 88,
        "humanEval": 96,
        "bigCodeBench": 76,
        "strengths": [
          "Best at complex reasoning",
          "Strong system design",
          "Excellent debugging"
        ],
        "weaknesses": [
          "Expensive for bulk tasks",
          "Slower response times"
        ],
        "pricePerScorePoint": 0.174
      },
      {
        "modelId": "claude-3-5-sonnet",
        "modelName": "Claude 3.5 Sonnet",
        "provider": "Anthropic",
        "overallScore": 72,
        "sweBench": 68,
        "liveCodeBench": 75,
        "humanEval": 90,
        "bigCodeBench": 58,
        "strengths": [
          "Balanced performance",
          "Computer use capability",
          "Artifact generation"
        ],
        "weaknesses": [
          "Older architecture",
          "Falling behind Sonnet 4"
        ],
        "pricePerScorePoint": 0.042
      },
      {
        "modelId": "claude-3-5-haiku",
        "modelName": "Claude 3.5 Haiku",
        "provider": "Anthropic",
        "overallScore": 52,
        "sweBench": 45,
        "liveCodeBench": 55,
        "humanEval": 75,
        "bigCodeBench": 38,
        "strengths": [
          "Fastest Claude model",
          "Cheapest option",
          "Good for code review"
        ],
        "weaknesses": [
          "Struggles with complex tasks",
          "Limited reasoning depth"
        ],
        "pricePerScorePoint": 0.015
      },
      {
        "modelId": "claude-4-haiku",
        "modelName": "Claude 4 Haiku",
        "provider": "Anthropic",
        "overallScore": 55,
        "sweBench": 48,
        "liveCodeBench": 58,
        "humanEval": 78,
        "bigCodeBench": 40,
        "strengths": [
          "Fast",
          "Good value",
          "Improved over 3.5 Haiku"
        ],
        "weaknesses": [
          "Limited reasoning depth"
        ],
        "pricePerScorePoint": 0.015
      },
      {
        "modelId": "claude-sonnet-4-lite",
        "modelName": "Claude Sonnet 4 Lite",
        "provider": "Anthropic",
        "overallScore": 70,
        "sweBench": 64,
        "liveCodeBench": 74,
        "humanEval": 88,
        "bigCodeBench": 56,
        "strengths": [
          "Good value",
          "Strong coding"
        ],
        "weaknesses": [
          "Less capable than full Sonnet 4"
        ],
        "pricePerScorePoint": 0.014
      },
      {
        "modelId": "claude-3-opus",
        "modelName": "Claude 3 Opus",
        "provider": "Anthropic",
        "overallScore": 78,
        "sweBench": 74,
        "liveCodeBench": 80,
        "humanEval": 94,
        "bigCodeBench": 64,
        "strengths": [
          "Strong reasoning",
          "Proven track record"
        ],
        "weaknesses": [
          "Older generation",
          "Expensive"
        ],
        "pricePerScorePoint": 0.192
      },
      {
        "modelId": "claude-3-sonnet",
        "modelName": "Claude 3 Sonnet",
        "provider": "Anthropic",
        "overallScore": 65,
        "sweBench": 58,
        "liveCodeBench": 68,
        "humanEval": 85,
        "bigCodeBench": 50,
        "strengths": [
          "Reliable",
          "Good value"
        ],
        "weaknesses": [
          "Two generations behind"
        ],
        "pricePerScorePoint": 0.046
      },
      {
        "modelId": "claude-3-haiku",
        "modelName": "Claude 3 Haiku",
        "provider": "Anthropic",
        "overallScore": 45,
        "sweBench": 38,
        "liveCodeBench": 46,
        "humanEval": 68,
        "bigCodeBench": 30,
        "strengths": [
          "Very cheap",
          "Fast"
        ],
        "weaknesses": [
          "Basic capability only"
        ],
        "pricePerScorePoint": 0.006
      },
      {
        "modelId": "gpt-4o",
        "modelName": "GPT-4o",
        "provider": "OpenAI",
        "overallScore": 75,
        "sweBench": 70,
        "liveCodeBench": 78,
        "humanEval": 90,
        "bigCodeBench": 62,
        "strengths": [
          "Strong general-purpose",
          "Good multimodal"
        ],
        "weaknesses": [
          "Less consistent on coding than Claude"
        ],
        "pricePerScorePoint": 0.033
      },
      {
        "modelId": "gpt-4o-mini",
        "modelName": "GPT-4o Mini",
        "provider": "OpenAI",
        "overallScore": 58,
        "sweBench": 50,
        "liveCodeBench": 60,
        "humanEval": 78,
        "bigCodeBench": 44,
        "strengths": [
          "Very cheap",
          "Fast responses"
        ],
        "weaknesses": [
          "Struggles with multi-step reasoning"
        ],
        "pricePerScorePoint": 0.003
      },
      {
        "modelId": "gpt-4.1",
        "modelName": "GPT-4.1",
        "provider": "OpenAI",
        "overallScore": 80,
        "sweBench": 76,
        "liveCodeBench": 82,
        "humanEval": 94,
        "bigCodeBench": 68,
        "strengths": [
          "Latest GPT model",
          "Strong across all benchmarks"
        ],
        "weaknesses": [
          "Premium pricing"
        ],
        "pricePerScorePoint": 0.063
      },
      {
        "modelId": "gpt-4.1-mini",
        "modelName": "GPT-4.1 Mini",
        "provider": "OpenAI",
        "overallScore": 68,
        "sweBench": 62,
        "liveCodeBench": 70,
        "humanEval": 86,
        "bigCodeBench": 54,
        "strengths": [
          "Good value",
          "Latest architecture"
        ],
        "weaknesses": [
          "Mini variant limitations"
        ],
        "pricePerScorePoint": 0.022
      },
      {
        "modelId": "gpt-4-turbo",
        "modelName": "GPT-4 Turbo",
        "provider": "OpenAI",
        "overallScore": 70,
        "sweBench": 64,
        "liveCodeBench": 72,
        "humanEval": 88,
        "bigCodeBench": 56,
        "strengths": [
          "Proven model",
          "Large context"
        ],
        "weaknesses": [
          "Superseded by GPT-4o"
        ],
        "pricePerScorePoint": 0.133
      },
      {
        "modelId": "gpt-4",
        "modelName": "GPT-4",
        "provider": "OpenAI",
        "overallScore": 68,
        "sweBench": 60,
        "liveCodeBench": 70,
        "humanEval": 86,
        "bigCodeBench": 54,
        "strengths": [
          "Original breakthrough model"
        ],
        "weaknesses": [
          "Two generations behind",
          "Expensive"
        ],
        "pricePerScorePoint": 0.441
      },
      {
        "modelId": "gpt-3.5-turbo",
        "modelName": "GPT-3.5 Turbo",
        "provider": "OpenAI",
        "overallScore": 40,
        "sweBench": 32,
        "liveCodeBench": 42,
        "humanEval": 62,
        "bigCodeBench": 26,
        "strengths": [
          "Ultra-cheap",
          "Very fast"
        ],
        "weaknesses": [
          "Basic coding only"
        ],
        "pricePerScorePoint": 0.013
      },
      {
        "modelId": "o1",
        "modelName": "o1",
        "provider": "OpenAI",
        "overallScore": 83,
        "sweBench": 80,
        "liveCodeBench": 84,
        "humanEval": 95,
        "bigCodeBench": 73,
        "strengths": [
          "Strong step-by-step reasoning",
          "Best at math-heavy coding"
        ],
        "weaknesses": [
          "Expensive",
          "Slow"
        ],
        "pricePerScorePoint": 0.181
      },
      {
        "modelId": "o1-mini",
        "modelName": "o1-mini",
        "provider": "OpenAI",
        "overallScore": 70,
        "sweBench": 64,
        "liveCodeBench": 72,
        "humanEval": 90,
        "bigCodeBench": 54,
        "strengths": [
          "Reasoning at lower cost",
          "Good for competitive programming"
        ],
        "weaknesses": [
          "Slower than standard models"
        ],
        "pricePerScorePoint": 0.016
      },
      {
        "modelId": "o3-mini",
        "modelName": "o3-mini",
        "provider": "OpenAI",
        "overallScore": 80,
        "sweBench": 76,
        "liveCodeBench": 85,
        "humanEval": 94,
        "bigCodeBench": 65,
        "strengths": [
          "Excellent at competitive programming",
          "Strong algorithmic reasoning"
        ],
        "weaknesses": [
          "Optimized for reasoning, not chat"
        ],
        "pricePerScorePoint": 0.014
      },
      {
        "modelId": "o3",
        "modelName": "o3",
        "provider": "OpenAI",
        "overallScore": 85,
        "sweBench": 82,
        "liveCodeBench": 88,
        "humanEval": 96,
        "bigCodeBench": 74,
        "strengths": [
          "Latest reasoning model",
          "Top-tier across all benchmarks"
        ],
        "weaknesses": [
          "Very expensive",
          "Slow"
        ],
        "pricePerScorePoint": 0.167
      },
      {
        "modelId": "o4-mini",
        "modelName": "o4-mini",
        "provider": "OpenAI",
        "overallScore": 72,
        "sweBench": 66,
        "liveCodeBench": 74,
        "humanEval": 92,
        "bigCodeBench": 56,
        "strengths": [
          "Improved reasoning at mini price"
        ],
        "weaknesses": [
          "New model, limited data"
        ],
        "pricePerScorePoint": 0.015
      },
      {
        "modelId": "gemini-2.5-pro",
        "modelName": "Gemini 2.5 Pro",
        "provider": "Google",
        "overallScore": 76,
        "sweBench": 72,
        "liveCodeBench": 79,
        "humanEval": 89,
        "bigCodeBench": 64,
        "strengths": [
          "Large context window (1M tokens)",
          "Good at code understanding"
        ],
        "weaknesses": [
          "Less consistent code generation"
        ],
        "pricePerScorePoint": 0.016
      },
      {
        "modelId": "gemini-2.5-flash",
        "modelName": "Gemini 2.5 Flash",
        "provider": "Google",
        "overallScore": 65,
        "sweBench": 58,
        "liveCodeBench": 68,
        "humanEval": 85,
        "bigCodeBench": 50,
        "strengths": [
          "Very cheap",
          "Fast",
          "Large context"
        ],
        "weaknesses": [
          "Weaker reasoning than Pro"
        ],
        "pricePerScorePoint": 0.002
      },
      {
        "modelId": "gemini-2.0-flash",
        "modelName": "Gemini 2.0 Flash",
        "provider": "Google",
        "overallScore": 55,
        "sweBench": 48,
        "liveCodeBench": 56,
        "humanEval": 78,
        "bigCodeBench": 40,
        "strengths": [
          "Ultra-cheap",
          "Fast"
        ],
        "weaknesses": [
          "Basic coding ability"
        ],
        "pricePerScorePoint": 0.002
      },
      {
        "modelId": "gemini-2.0-pro",
        "modelName": "Gemini 2.0 Pro",
        "provider": "Google",
        "overallScore": 68,
        "sweBench": 62,
        "liveCodeBench": 70,
        "humanEval": 86,
        "bigCodeBench": 54,
        "strengths": [
          "Solid performance",
          "Good context"
        ],
        "weaknesses": [
          "Superseded by 2.5"
        ],
        "pricePerScorePoint": 0.018
      },
      {
        "modelId": "gemini-1.5-pro",
        "modelName": "Gemini 1.5 Pro",
        "provider": "Google",
        "overallScore": 62,
        "sweBench": 56,
        "liveCodeBench": 64,
        "humanEval": 82,
        "bigCodeBench": 46,
        "strengths": [
          "Large context",
          "Established model"
        ],
        "weaknesses": [
          "Older generation"
        ],
        "pricePerScorePoint": 0.02
      },
      {
        "modelId": "gemini-1.5-flash",
        "modelName": "Gemini 1.5 Flash",
        "provider": "Google",
        "overallScore": 50,
        "sweBench": 42,
        "liveCodeBench": 52,
        "humanEval": 72,
        "bigCodeBench": 36,
        "strengths": [
          "Cheap",
          "Fast"
        ],
        "weaknesses": [
          "Basic capability"
        ],
        "pricePerScorePoint": 0.001
      },
      {
        "modelId": "qwen-max",
        "modelName": "Qwen Max",
        "provider": "Qwen",
        "overallScore": 68,
        "sweBench": 62,
        "liveCodeBench": 70,
        "humanEval": 86,
        "bigCodeBench": 54,
        "strengths": [
          "Strong Chinese language support",
          "Good value"
        ],
        "weaknesses": [
          "Less tested on English coding"
        ],
        "pricePerScorePoint": 0.024
      },
      {
        "modelId": "qwen-plus",
        "modelName": "Qwen Plus",
        "provider": "Qwen",
        "overallScore": 55,
        "sweBench": 48,
        "liveCodeBench": 58,
        "humanEval": 78,
        "bigCodeBench": 40,
        "strengths": [
          "Budget-friendly"
        ],
        "weaknesses": [
          "Average performance"
        ],
        "pricePerScorePoint": 0.007
      },
      {
        "modelId": "qwen-turbo",
        "modelName": "Qwen Turbo",
        "provider": "Qwen",
        "overallScore": 42,
        "sweBench": 35,
        "liveCodeBench": 44,
        "humanEval": 65,
        "bigCodeBench": 28,
        "strengths": [
          "Cheapest option",
          "Fast"
        ],
        "weaknesses": [
          "Basic coding only"
        ],
        "pricePerScorePoint": 0.002
      },
      {
        "modelId": "qwen-coder",
        "modelName": "Qwen Coder",
        "provider": "Qwen",
        "overallScore": 60,
        "sweBench": 54,
        "liveCodeBench": 62,
        "humanEval": 82,
        "bigCodeBench": 45,
        "strengths": [
          "Code-specialized"
        ],
        "weaknesses": [
          "Weaker at reasoning"
        ],
        "pricePerScorePoint": 0.007
      },
      {
        "modelId": "qwen3-6-plus",
        "modelName": "Qwen3 6 Plus",
        "provider": "Qwen",
        "overallScore": 72,
        "sweBench": 66,
        "liveCodeBench": 74,
        "humanEval": 88,
        "bigCodeBench": 58,
        "strengths": [
          "Latest Qwen architecture",
          "Strong performance"
        ],
        "weaknesses": [
          "Newer model"
        ],
        "pricePerScorePoint": 0.011
      },
      {
        "modelId": "deepseek-chat",
        "modelName": "DeepSeek Chat (V3)",
        "provider": "DeepSeek",
        "overallScore": 62,
        "sweBench": 56,
        "liveCodeBench": 64,
        "humanEval": 84,
        "bigCodeBench": 46,
        "strengths": [
          "Excellent value",
          "Strong coding focus"
        ],
        "weaknesses": [
          "Less general-purpose"
        ],
        "pricePerScorePoint": 0.004
      },
      {
        "modelId": "deepseek-reasoner",
        "modelName": "DeepSeek Reasoner (R1)",
        "provider": "DeepSeek",
        "overallScore": 72,
        "sweBench": 68,
        "liveCodeBench": 76,
        "humanEval": 90,
        "bigCodeBench": 56,
        "strengths": [
          "Strong reasoning chain",
          "Good value"
        ],
        "weaknesses": [
          "Slow on simple tasks"
        ],
        "pricePerScorePoint": 0.008
      },
      {
        "modelId": "deepseek-coder",
        "modelName": "DeepSeek Coder V2",
        "provider": "DeepSeek",
        "overallScore": 58,
        "sweBench": 50,
        "liveCodeBench": 60,
        "humanEval": 82,
        "bigCodeBench": 42,
        "strengths": [
          "Code-specialized",
          "Very cheap"
        ],
        "weaknesses": [
          "Limited general-purpose"
        ],
        "pricePerScorePoint": 0.005
      },
      {
        "modelId": "mistral-large-2",
        "modelName": "Mistral Large 2",
        "provider": "Mistral",
        "overallScore": 65,
        "sweBench": 58,
        "liveCodeBench": 66,
        "humanEval": 84,
        "bigCodeBench": 52,
        "strengths": [
          "European data residency",
          "Good value"
        ],
        "weaknesses": [
          "Smaller ecosystem"
        ],
        "pricePerScorePoint": 0.031
      },
      {
        "modelId": "mistral-codestral",
        "modelName": "Mistral Codestral",
        "provider": "Mistral",
        "overallScore": 60,
        "sweBench": 54,
        "liveCodeBench": 64,
        "humanEval": 82,
        "bigCodeBench": 44,
        "strengths": [
          "Code-specialized",
          "Very cheap"
        ],
        "weaknesses": [
          "Narrow focus"
        ],
        "pricePerScorePoint": 0.005
      },
      {
        "modelId": "mistral-nemo",
        "modelName": "Mistral Nemo",
        "provider": "Mistral",
        "overallScore": 48,
        "sweBench": 40,
        "liveCodeBench": 50,
        "humanEval": 70,
        "bigCodeBench": 32,
        "strengths": [
          "Open weight",
          "Self-hostable"
        ],
        "weaknesses": [
          "Basic coding ability"
        ],
        "pricePerScorePoint": 0.002
      },
      {
        "modelId": "mistral-small",
        "modelName": "Mistral Small",
        "provider": "Mistral",
        "overallScore": 42,
        "sweBench": 35,
        "liveCodeBench": 44,
        "humanEval": 65,
        "bigCodeBench": 28,
        "strengths": [
          "Ultra-cheap"
        ],
        "weaknesses": [
          "Very limited capabilities"
        ],
        "pricePerScorePoint": 0.002
      },
      {
        "modelId": "grok-3",
        "modelName": "Grok 3",
        "provider": "xAI",
        "overallScore": 70,
        "sweBench": 64,
        "liveCodeBench": 72,
        "humanEval": 88,
        "bigCodeBench": 56,
        "strengths": [
          "Strong reasoning",
          "X integration"
        ],
        "weaknesses": [
          "Newer model",
          "Limited ecosystem"
        ],
        "pricePerScorePoint": 0.043
      },
      {
        "modelId": "grok-3-mini",
        "modelName": "Grok 3 Mini",
        "provider": "xAI",
        "overallScore": 50,
        "sweBench": 42,
        "liveCodeBench": 52,
        "humanEval": 72,
        "bigCodeBench": 36,
        "strengths": [
          "Budget option",
          "Fast"
        ],
        "weaknesses": [
          "Limited capabilities"
        ],
        "pricePerScorePoint": 0.006
      },
      {
        "modelId": "grok-code-fast-1",
        "modelName": "Grok Code Fast 1",
        "provider": "xAI",
        "overallScore": 65,
        "sweBench": 58,
        "liveCodeBench": 68,
        "humanEval": 84,
        "bigCodeBench": 50,
        "strengths": [
          "Code-specialized",
          "Fast"
        ],
        "weaknesses": [
          "New model"
        ],
        "pricePerScorePoint": 0.077
      },
      {
        "modelId": "llama-3.3-70b",
        "modelName": "Llama 3.3 70B",
        "provider": "Meta",
        "overallScore": 52,
        "sweBench": 44,
        "liveCodeBench": 54,
        "humanEval": 76,
        "bigCodeBench": 38,
        "strengths": [
          "Open source",
          "Self-hostable"
        ],
        "weaknesses": [
          "Requires own infrastructure"
        ],
        "pricePerScorePoint": 0.004
      },
      {
        "modelId": "phi-4",
        "modelName": "Microsoft Phi-4",
        "provider": "Microsoft",
        "overallScore": 45,
        "sweBench": 38,
        "liveCodeBench": 46,
        "humanEval": 68,
        "bigCodeBench": 30,
        "strengths": [
          "Small model, runs locally"
        ],
        "weaknesses": [
          "Limited capacity"
        ],
        "pricePerScorePoint": 0.002
      },
      {
        "modelId": "reka-flash",
        "modelName": "Reka Flash",
        "provider": "Reka",
        "overallScore": 40,
        "sweBench": 32,
        "liveCodeBench": 42,
        "humanEval": 62,
        "bigCodeBench": 26,
        "strengths": [
          "Multimodal",
          "Fast"
        ],
        "weaknesses": [
          "Basic coding",
          "Niche provider"
        ],
        "pricePerScorePoint": 0.025
      }
    ]
  },
  "lastUpdated": "2026-04-17T19:44:59.106Z"
}