{
  "benchmarkWeek": "local-openai-gpt-5-nano-full-judge-20260530",
  "benchmarkVersion": "2026-q1",
  "taskCount": 15,
  "repeatCount": 2,
  "domains": [
    "cross_functional",
    "design",
    "engineering",
    "marketing",
    "ops",
    "product",
    "sales"
  ],
  "providers": [
    "openai"
  ],
  "models": [
    "gpt-5-nano",
    "gpt-5-mini",
    "gpt-5.1"
  ],
  "runtimes": [
    "node-24.13.0"
  ],
  "claims": [
    "Local public-catalog run using OpenAI Responses API.",
    "Artifacts were scored by independent OpenAI judge calls that did not generate the artifact.",
    "Public quality scores use median criterion scores and flag material judge disagreement for human review."
  ],
  "generationMethod": {
    "provider": "openai",
    "model": "gpt-5-nano",
    "reasoningEffort": "minimal",
    "maxOutputTokens": 3000
  },
  "evaluationMethod": {
    "independentJudges": true,
    "judgePanel": [
      {
        "provider": "openai",
        "model": "gpt-5-nano",
        "reasoningEffort": "low"
      },
      {
        "provider": "openai",
        "model": "gpt-5-mini",
        "reasoningEffort": "medium"
      },
      {
        "provider": "openai",
        "model": "gpt-5.1",
        "reasoningEffort": "high"
      }
    ],
    "judgeMaxOutputTokens": 2500,
    "scoreAggregation": "median criterion score across completed independent judges",
    "disagreementThresholdPoints": 8,
    "humanReviewPolicy": "flag if any judge recommends review, any judge fails, or judge disagreement exceeds threshold"
  },
  "tokenUsage": {
    "generation": {
      "inputTokens": 14922,
      "cachedInputTokens": 0,
      "outputTokens": 39282,
      "reasoningTokens": 0,
      "totalTokens": 54204,
      "costCents": 1.6459
    },
    "judging": {
      "inputTokens": 146680,
      "cachedInputTokens": 0,
      "outputTokens": 83289,
      "reasoningTokens": 57077,
      "totalTokens": 229969,
      "costCents": 51.3439
    },
    "total": {
      "inputTokens": 161602,
      "cachedInputTokens": 0,
      "outputTokens": 122571,
      "reasoningTokens": 57077,
      "totalTokens": 284173,
      "costCents": 52.9898
    }
  },
  "assumptions": {
    "modelSelection": "gpt-5-nano was selected for the cheapest complete OpenAI smoke run.",
    "reasoningEffort": "minimal",
    "maxOutputTokens": 3000,
    "judgeProtocol": "Artifacts were scored by independent judge calls that did not generate the artifact."
  },
  "dataFiles": [
    "summary.json",
    "metadata.json",
    "examples.json",
    "tasks.json",
    "scorecard.csv",
    "judgments.json"
  ]
}