"""
V3 Pre-flight: minimal verification that EXP-081c source-grounding effect
still holds on current (2026-05-01) models.

The claim under test: source-present produces dramatically more sourced
numbers than source-absent on the SAME prompt. This isolates source
material as the dominant variable, larger than any prompt-engineering
intervention.

Design:
  1 topic (remote_work) x 2 conditions (source_present, source_absent)
  x 2 versions x 3 generators (grok-4-1-fast, gemini-3-flash-preview,
  gpt-5-mini)
  = 12 documents total.

Measurement: programmatic number matching (zero LLM judgment).

Pass criterion: source-present source-match rate is >=30pp higher than
source-absent on at least 2 of 3 generators. (Original 081c: ~46pp
average across topics. Allow drift / single-topic variance.)
"""

import json
import os
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from _config import get_openai_client, call_generator

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, THIS_DIR)
from exp081_prompt_arch import TOPICS, analyze_numbers, load_source
from exp081_commensurable_bridge import (
    t3_prompt_with_source,
    t3_prompt_without_source,
)

# Generator models exercised by this pre-flight; each one is run under both
# conditions.
GENERATORS = [
    "grok-4-1-fast",
    "gemini-3-flash-preview",
    "gpt-5-mini",
]

# Key into TOPICS selecting the single topic this run covers.
TOPIC_KEY = "remote_work"

# Number of documents generated per (generator, condition) cell.
VERSIONS = 2


# Pass criterion (see module docstring): a generator passes when its
# source-present match rate exceeds its source-absent rate by at least
# _PASS_DELTA_PP percentage points; the run passes when at least
# _MIN_PASSING_GENERATORS generators pass.
_PASS_DELTA_PP = 30.0
_MIN_PASSING_GENERATORS = 2
_CONDITIONS = ("SOURCE_PRESENT", "SOURCE_ABSENT")


def _summarize_cells(results, models, conditions=_CONDITIONS):
    """Pool number counts over the documents of each (model, condition) cell.

    Args:
        results: per-document records as built by ``run`` (each has
            ``model``, ``condition`` and an ``analysis`` dict with
            ``total_numbers`` and ``in_source``).
        models: model names to summarize.
        conditions: condition names to summarize.

    Returns:
        Dict keyed by ``(model, condition)``. Every requested cell is
        present, including cells with no documents (``n_docs == 0``), whose
        counts and rate are all zero.
    """
    summary = {}
    for model in models:
        for cond in conditions:
            cell = [
                r for r in results
                if r["model"] == model and r["condition"] == cond
            ]
            total_nums = sum(r["analysis"]["total_numbers"] for r in cell)
            total_sourced = sum(r["analysis"]["in_source"] for r in cell)
            rate = total_sourced / total_nums if total_nums > 0 else 0
            summary[(model, cond)] = {
                "total_numbers": total_nums,
                "in_source": total_sourced,
                "source_match_rate": round(rate, 4),
                "n_docs": len(cell),
            }
    return summary


def _grounding_effect(present, absent, threshold_pp=_PASS_DELTA_PP):
    """Judge one generator from its two summary cells.

    Args:
        present: summary cell for the source-present condition.
        absent: summary cell for the source-absent condition.
        threshold_pp: minimum present-minus-absent gap, in percentage
            points, required to pass.

    Returns:
        ``(delta_pp, verdict)``. ``verdict`` is ``"PASS"``, ``"FAIL"`` or
        ``"NO DATA"``; ``delta_pp`` is ``None`` for ``"NO DATA"``.
    """
    # A cell with no documents has a placeholder rate of 0, not a measured
    # one. Comparing against it could manufacture a PASS out of failed
    # generations, so refuse to judge.
    if present["n_docs"] == 0 or absent["n_docs"] == 0:
        return None, "NO DATA"
    # Rates carry 4 decimals, so the gap in pp has at most 2. Rounding removes
    # binary-float noise (e.g. 0.7 - 0.4 -> 29.999999999999993) that would
    # otherwise fail a gap sitting exactly on the threshold.
    delta_pp = round(
        (present["source_match_rate"] - absent["source_match_rate"]) * 100, 2
    )
    return delta_pp, ("PASS" if delta_pp >= threshold_pp else "FAIL")


def run():
    """Generate all documents, measure source grounding, print and save results.

    For every generator in GENERATORS and each of the two conditions,
    VERSIONS documents are generated for the TOPIC_KEY topic. Each document
    is scored with ``analyze_numbers`` against the source text. Counts are
    pooled per (generator, condition) cell, the present-vs-absent gap is
    judged per generator, and everything is written to
    ``preflight_v3_results.json`` next to this script.

    A ``None`` return from ``call_generator`` is treated as a failed
    generation: it is reported and excluded from the results. A generator
    left with no documents in either condition is reported as ``NO DATA``
    and never counted as a pass.
    """
    cfg = TOPICS[TOPIC_KEY]
    source = load_source(cfg["source_file"])
    client = get_openai_client()
    expected_generations = len(GENERATORS) * len(_CONDITIONS) * VERSIONS
    print(f"Topic: {TOPIC_KEY}")
    print(f"Source: {len(source)} chars")
    print(f"Generators: {GENERATORS}")
    print("Conditions: source_present, source_absent")
    print(f"Versions per cell: {VERSIONS}")
    print(f"Total generations: {expected_generations}")
    print()

    judge_args = (
        cfg["topic"],
        cfg["judge_role"],
        cfg["judge_axis"],
        cfg["judge_cares"],
    )
    # Builders stay lazy and are invoked once per generation, as before; the
    # two conditions differ only in whether the source text is passed in.
    prompt_builders = [
        ("SOURCE_PRESENT", lambda: t3_prompt_with_source(source, *judge_args)),
        ("SOURCE_ABSENT", lambda: t3_prompt_without_source(*judge_args)),
    ]

    results = []

    for model in GENERATORS:
        for cond_name, build_prompt in prompt_builders:
            for v in range(VERSIONS):
                print(
                    f"  {model:30s} {cond_name:18s} v{v+1}...",
                    end=" ",
                    flush=True,
                )
                t0 = time.time()
                prompt = build_prompt()
                text = call_generator(
                    client,
                    model,
                    [{"role": "user", "content": prompt}],
                )
                elapsed = time.time() - t0
                if text is None:
                    print(f"FAILED ({elapsed:.1f}s)")
                    continue

                analysis = analyze_numbers(text, source)
                in_source = analysis["in_source"]
                total = analysis["total_numbers"]
                source_rate = in_source / total if total > 0 else 0
                print(
                    f"{total:3d} numbers, "
                    f"{in_source:2d} sourced "
                    f"({source_rate*100:.1f}%) "
                    f"[{elapsed:.1f}s]"
                )
                results.append({
                    "model": model,
                    "condition": cond_name,
                    "version": v + 1,
                    "topic": TOPIC_KEY,
                    "doc_text": text,
                    "analysis": analysis,
                    "elapsed_s": round(elapsed, 1),
                })

    failed_generations = expected_generations - len(results)
    if failed_generations:
        print()
        print(
            f"WARNING: {failed_generations}/{expected_generations} "
            f"generations failed and are excluded from the aggregate"
        )

    # Aggregate
    print()
    print("=" * 70)
    print("AGGREGATE")
    print("=" * 70)

    summary = _summarize_cells(results, GENERATORS)
    for (model, cond), cell in summary.items():
        print(
            f"  {model:30s} {cond:18s} "
            f"{cell['total_numbers']:3d} nums, "
            f"{cell['in_source']:2d} sourced, "
            f"{cell['source_match_rate']*100:.1f}%"
        )

    # Source-grounding effect (pp delta)
    print()
    print("SOURCE-GROUNDING EFFECT (source-present vs source-absent):")
    pass_count = 0
    effect = {}
    for model in GENERATORS:
        present_cell = summary[(model, "SOURCE_PRESENT")]
        absent_cell = summary[(model, "SOURCE_ABSENT")]
        delta_pp, verdict = _grounding_effect(present_cell, absent_cell)
        effect[model] = {"delta_pp": delta_pp, "verdict": verdict}
        if verdict == "PASS":
            pass_count += 1
        if delta_pp is None:
            print(
                f"  {model:30s} no documents for at least one condition"
                f"  [{verdict}]"
            )
            continue
        present = present_cell["source_match_rate"]
        absent = absent_cell["source_match_rate"]
        print(
            f"  {model:30s} {present*100:.1f}% -> {absent*100:.1f}% "
            f"= {delta_pp:+.1f}pp  [{verdict}]"
        )

    print()
    overall = "PASS" if pass_count >= _MIN_PASSING_GENERATORS else "FAIL"
    print(f"Overall: {pass_count}/{len(GENERATORS)} generators passed  [{overall}]")

    # Save
    out_path = os.path.join(THIS_DIR, "preflight_v3_results.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "design": {
                "topic": TOPIC_KEY,
                "generators": GENERATORS,
                "conditions": list(_CONDITIONS),
                "versions_per_cell": VERSIONS,
            },
            # Tuple keys are not valid JSON object keys; flatten to
            # "model|condition".
            "summary": {
                f"{k[0]}|{k[1]}": v for k, v in summary.items()
            },
            "effect": {
                "threshold_pp": _PASS_DELTA_PP,
                "min_passing_generators": _MIN_PASSING_GENERATORS,
                "per_generator": effect,
                "pass_count": pass_count,
                "overall": overall,
                "failed_generations": failed_generations,
            },
            "results": results,
        }, f, indent=2)
    print()
    print(f"Saved: {out_path}")


# Run the pre-flight only when this file is executed as a script, so that
# importing the module does not trigger any generation.
if __name__ == "__main__":
    run()