{
  "feature": "document-summarizer",
  "description": "Golden dataset + scorer results for a doc-summarization feature. Fictional but internally consistent.",
  "version": "v3",
  "generated_at": "2026-04-12T14:22:00Z",
  "model_under_test": "claude-sonnet-4-6",
  "prompt_version": "summarize_doc_v3.xml",
  "runs": {
    "total": 40,
    "passed": 37,
    "failed": 3,
    "pass_rate": 0.925
  },
  "scorers": [
    {
      "name": "contains_all_key_points",
      "type": "llm_as_judge",
      "weight": 0.4,
      "description": "Does the summary include every key point from the expected_key_points list?"
    },
    {
      "name": "no_hallucinated_claims",
      "type": "llm_as_judge",
      "weight": 0.3,
      "description": "Are all claims in the summary supported by the source document?"
    },
    {
      "name": "length_within_bounds",
      "type": "regex",
      "weight": 0.15,
      "description": "Is the summary between 50 and 250 words?"
    },
    {
      "name": "format_valid_json",
      "type": "structured",
      "weight": 0.15,
      "description": "Does the response parse as JSON matching the expected schema?"
    }
  ],
  "regression_vs_v2": {
    "v2_pass_rate": 0.875,
    "v3_pass_rate": 0.925,
    "delta": 0.05,
    "verdict": "improvement"
  },
  "examples": [
    {
      "id": "e_001",
      "input_doc_title": "Q2 engineering all-hands notes",
      "input_length_tokens": 4820,
      "expected_key_points": [
        "Authentication rewrite scheduled for Q3",
        "Observability budget approved at $24k/year",
        "Two senior hires planned, offers out"
      ],
      "output": "The Q2 all-hands covered three main threads. Engineering is scheduling an authentication rewrite for Q3, replacing the legacy session system with WebAuthn-capable infrastructure. An observability budget of $24,000 annually was approved, funding a combination of Sentry, PostHog, and Grafana Cloud. Two senior engineering hires are in progress with offers extended. Secondary topics included the incident retrospective process (now owned by VP Eng) and a renewed commitment to shipping one small user-facing improvement per sprint.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_002",
      "input_doc_title": "Sales call with Plumbline (enterprise prospect)",
      "input_length_tokens": 6400,
      "expected_key_points": [
        "Budget is confirmed at $60k-$80k annual",
        "Decision timeline: end of Q3",
        "Blocker: SOC 2 Type II certification",
        "Champion: VP Engineering (Amelia)"
      ],
      "output": "Plumbline is evaluating us for a 2026 rollout. Budget is confirmed in the $60k to $80k annual range. Decision expected by end of Q3. The primary blocker is SOC 2 Type II certification, which Amelia (VP Eng, internal champion) needs before she can greenlight the contract. Amelia has offered to act as a reference once the certification lands.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_003",
      "input_doc_title": "Contract negotiation email thread",
      "input_length_tokens": 3200,
      "expected_key_points": [
        "Liability cap requested: 2x annual fees",
        "Data processing terms aligned",
        "MSA signed; SOW pending"
      ],
      "output": "The contract negotiation is mostly resolved. The client requested a liability cap at 2x annual fees, which we accepted. Data processing terms are aligned with their GDPR addendum. The MSA is signed; the SOW is pending final scope confirmation, expected early next week.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_004",
      "input_doc_title": "Incident post-mortem: 2026-03-14 API outage",
      "input_length_tokens": 8900,
      "expected_key_points": [
        "Root cause: connection pool exhaustion from retry storm",
        "Duration: 47 minutes",
        "Customers affected: 120 (approx 8 percent of active)",
        "Mitigation shipped: circuit breaker on upstream calls"
      ],
      "output": "The 2026-03-14 API outage lasted 47 minutes and affected approximately 120 customers (8 percent of active accounts). Root cause was connection pool exhaustion triggered by a retry storm against a failing upstream dependency. A circuit breaker has been shipped and tested to prevent recurrence. Follow-up work includes alerting on pool saturation at 80 percent and documenting the dependency topology.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_005",
      "input_doc_title": "RFC: switching from REST to GraphQL",
      "input_length_tokens": 5100,
      "expected_key_points": [
        "Decision: adopt GraphQL for new endpoints only",
        "Existing REST endpoints: maintained, not migrated",
        "Timeline: first GraphQL endpoints in Q3",
        "Owner: platform team"
      ],
      "output": "The RFC recommends adopting GraphQL for new endpoints while maintaining existing REST endpoints in place. No bulk migration is planned. First GraphQL endpoints scheduled for Q3, owned by the platform team. The deciding factors were client-side flexibility (mobile team) and reduction in overfetching on the dashboard.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_006",
      "input_doc_title": "Product roadmap Q3 planning doc",
      "input_length_tokens": 7400,
      "expected_key_points": [
        "Q3 theme: reliability and observability",
        "Three shipping commitments: uptime SLO, incident dashboard, synthetic monitoring",
        "Not shipping in Q3: mobile app, enterprise SSO",
        "Budget: 3 engineers full-time for quarter"
      ],
      "output": "Q3 is themed around reliability and observability. Three commitments for the quarter: uptime SLO with public status page, internal incident dashboard, and synthetic monitoring across top customer flows. Deliberately deferred: mobile app and enterprise SSO. Staffing is three engineers full-time.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_007",
      "input_doc_title": "Customer support macro review",
      "input_length_tokens": 2800,
      "expected_key_points": [
        "Top 3 macros: password reset, billing question, export data",
        "Combined these are 68 percent of tickets",
        "Self-serve proposed for password reset"
      ],
      "output": "Support reviewed the top three ticket macros: password reset, billing questions, and data export requests. Together these account for 68 percent of all tickets. A self-serve password reset flow is proposed to reduce volume.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 1.0,
      "pass": true
    },
    {
      "id": "e_fail_01",
      "input_doc_title": "Board deck: 2026 operating plan",
      "input_length_tokens": 12400,
      "expected_key_points": [
        "ARR target: $12M by year-end",
        "Hiring plan: 8 engineering, 4 GTM",
        "Runway: 22 months",
        "Key risk: enterprise sales cycle"
      ],
      "output": "The 2026 operating plan targets $12M ARR by year-end with a hiring plan of 8 engineering and 4 go-to-market hires. The company's runway stands at 22 months. Key strategic risks discussed include the lengthening enterprise sales cycle and increased competitive pressure from a well-funded newcomer in the space. The board approved the plan with minor adjustments to the marketing budget.",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 0.5,
        "length_within_bounds": 1.0,
        "format_valid_json": 1.0
      },
      "weighted_score": 0.85,
      "pass": false,
      "failure_reason": "Hallucinated 'well-funded newcomer' and 'board approved with minor adjustments to marketing budget' - these claims are not in the source document.",
      "action": "Prompt v4: strengthen no-hallucination instruction. Add explicit 'do not add context not present in source' to system prompt."
    },
    {
      "id": "e_fail_02",
      "input_doc_title": "Architecture review: event sourcing proposal",
      "input_length_tokens": 9800,
      "expected_key_points": [
        "Proposal: event sourcing for audit log only",
        "Not adopting for general state",
        "Concerns raised: operational complexity, query patterns",
        "Decision: prototype, 6-week spike"
      ],
      "output": "The architecture review covered a proposal to adopt event sourcing. After discussion, the team decided to adopt it system-wide starting Q3 with a migration plan over two quarters.",
      "scores": {
        "contains_all_key_points": 0.25,
        "no_hallucinated_claims": 0.0,
        "length_within_bounds": 0.6,
        "format_valid_json": 1.0
      },
      "weighted_score": 0.34,
      "pass": false,
      "failure_reason": "Major factual inversion. Source says 'event sourcing for audit log only, not general state'; summary says 'system-wide adoption'. Also missing concerns and prototype decision.",
      "action": "Add this case to the 'high-stakes inversion' test set. Consider adding a second pass where the model is asked to verify its summary against the source."
    },
    {
      "id": "e_fail_03",
      "input_doc_title": "Pricing page redesign brief",
      "input_length_tokens": 3400,
      "expected_key_points": [
        "New pricing: three tiers (Solo, Team, Enterprise)",
        "Annual discount: 15 percent",
        "Launch date: 2026-05-01",
        "Existing customers grandfathered for 12 months"
      ],
      "output": "{ \"summary\": \"Pricing redesign introduces three tiers - Solo, Team, Enterprise. Annual billing offers a 15 percent discount. Launch is 2026-05-01. Existing customers are grandfathered for 12 months on their current plan.\"",
      "scores": {
        "contains_all_key_points": 1.0,
        "no_hallucinated_claims": 1.0,
        "length_within_bounds": 1.0,
        "format_valid_json": 0.0
      },
      "weighted_score": 0.85,
      "pass": false,
      "failure_reason": "Valid summary content, but JSON is malformed (unclosed brace).",
      "action": "Add JSON validation + one-shot retry on parse failure in production code. Not a prompt issue; an infrastructure issue."
    }
  ],
  "notes": [
    "Remaining 30 passing examples omitted for brevity; the shape is the same.",
    "Weighted score = sum of (scorer_score * scorer_weight).",
    "Pass threshold = 0.85.",
    "e_fail_01 and e_fail_02 indicate prompt v3 still lets some hallucinations through. Prompt v4 in progress.",
    "e_fail_03 is a non-prompt issue; will be fixed in the API client, not the prompt."
  ]
}