"""
V3 Pre-flight: minimal verification that EXP-081c source-grounding effect
still holds on current (2026-05-01) models.

The claim under test: source-present produces dramatically more sourced
numbers than source-absent on the SAME prompt. This isolates source material
as the dominant variable, larger than any prompt-engineering intervention.

Design:
  1 topic (remote_work) x 2 conditions (source_present, source_absent)
  x 2 versions x 3 generators (grok-4-1-fast, gemini-3-flash-preview,
  gpt-5-mini) = 12 documents total.

Measurement: programmatic number matching (zero LLM judgment).

Pass criterion: source-present source-match rate is >=30pp higher than
source-absent on at least 2 of 3 generators. (Original 081c: ~46pp average
across topics. Allow drift / single-topic variance.) A generator that has no
successful generation in one of the two conditions is inconclusive and is
never counted as a pass.
"""

import json
import os
import sys
import time

sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from _config import get_openai_client, call_generator

THIS_DIR = os.path.dirname(os.path.abspath(__file__))
sys.path.insert(0, THIS_DIR)
from exp081_prompt_arch import TOPICS, analyze_numbers, load_source
from exp081_commensurable_bridge import (
    t3_prompt_with_source,
    t3_prompt_without_source,
)

GENERATORS = ["grok-4-1-fast", "gemini-3-flash-preview", "gpt-5-mini"]
TOPIC_KEY = "remote_work"
VERSIONS = 2

# Condition labels, in the order they are generated and reported.
CONDITIONS = ("SOURCE_PRESENT", "SOURCE_ABSENT")
# Minimum (present - absent) source-match delta, in percentage points,
# for a single generator to pass.
PASS_DELTA_PP = 30.0
# Number of generators that must pass for the overall pre-flight to pass.
MIN_PASSING_GENERATORS = 2


def _generate_documents(client, cfg, source):
    """Generate every (model, condition, version) document and analyze it.

    A generation for which ``call_generator`` returns ``None`` is reported as
    FAILED and skipped, so the returned list may hold fewer entries than the
    full design.

    Returns:
        A list of per-document result dicts (model, condition, version,
        topic, doc_text, analysis, elapsed_s).
    """
    judge_args = (
        cfg["topic"],
        cfg["judge_role"],
        cfg["judge_axis"],
        cfg["judge_cares"],
    )
    # Both conditions share the same judge arguments; only the presence of
    # the source text differs.
    prompt_builders = {
        "SOURCE_PRESENT": lambda: t3_prompt_with_source(source, *judge_args),
        "SOURCE_ABSENT": lambda: t3_prompt_without_source(*judge_args),
    }

    results = []
    for model in GENERATORS:
        for cond_name in CONDITIONS:
            build_prompt = prompt_builders[cond_name]
            for v in range(VERSIONS):
                print(
                    f" {model:30s} {cond_name:18s} v{v+1}...",
                    end=" ",
                    flush=True,
                )
                t0 = time.time()
                prompt = build_prompt()
                text = call_generator(
                    client,
                    model,
                    [{"role": "user", "content": prompt}],
                )
                elapsed = time.time() - t0

                if text is None:
                    print(f"FAILED ({elapsed:.1f}s)")
                    continue

                # Numbers are always matched against the source, including
                # in the SOURCE_ABSENT condition where the model never saw it.
                analysis = analyze_numbers(text, source)
                in_source = analysis["in_source"]
                total = analysis["total_numbers"]
                source_rate = in_source / total if total > 0 else 0
                print(
                    f"{total:3d} numbers, "
                    f"{in_source:2d} sourced "
                    f"({source_rate*100:.1f}%) "
                    f"[{elapsed:.1f}s]"
                )
                results.append({
                    "model": model,
                    "condition": cond_name,
                    "version": v + 1,
                    "topic": TOPIC_KEY,
                    "doc_text": text,
                    "analysis": analysis,
                    "elapsed_s": round(elapsed, 1),
                })
    return results


def _summarize(results):
    """Pool number counts per (model, condition) cell and print each cell.

    Returns:
        A dict keyed by ``(model, condition)`` with total_numbers, in_source,
        source_match_rate (rounded to 4 decimals) and n_docs.
    """
    summary = {}
    for model in GENERATORS:
        for cond in CONDITIONS:
            cell = [
                r for r in results
                if r["model"] == model and r["condition"] == cond
            ]
            total_nums = sum(r["analysis"]["total_numbers"] for r in cell)
            total_sourced = sum(r["analysis"]["in_source"] for r in cell)
            rate = total_sourced / total_nums if total_nums > 0 else 0
            summary[(model, cond)] = {
                "total_numbers": total_nums,
                "in_source": total_sourced,
                "source_match_rate": round(rate, 4),
                "n_docs": len(cell),
            }
            # Flag empty cells so a 0.0% caused by failed generations is not
            # mistaken for a measured rate.
            note = "" if cell else " (no successful generations)"
            print(
                f" {model:30s} {cond:18s} "
                f"{total_nums:3d} nums, {total_sourced:2d} sourced, "
                f"{rate*100:.1f}%{note}"
            )
    return summary


def _score_effect(summary):
    """Compute the per-generator source-grounding delta and verdict.

    A generator missing data in either condition gets the verdict
    ``INCONCLUSIVE`` and is not counted as a pass.

    Returns:
        ``(effects, pass_count)`` where ``effects`` maps each model to a dict
        with ``delta_pp`` (``None`` when inconclusive) and ``verdict``.
    """
    effects = {}
    pass_count = 0
    for model in GENERATORS:
        present_cell = summary[(model, "SOURCE_PRESENT")]
        absent_cell = summary[(model, "SOURCE_ABSENT")]
        missing = [
            cond for cond, cell in (
                ("SOURCE_PRESENT", present_cell),
                ("SOURCE_ABSENT", absent_cell),
            )
            if cell["n_docs"] == 0
        ]
        if missing:
            effects[model] = {"delta_pp": None, "verdict": "INCONCLUSIVE"}
            print(
                f" {model:30s} no successful generations for "
                f"{', '.join(missing)} [INCONCLUSIVE]"
            )
            continue

        present = present_cell["source_match_rate"]
        absent = absent_cell["source_match_rate"]
        # Rates are rounded to 4 decimals, so the delta in pp has at most 2;
        # rounding removes float noise such as (0.7 - 0.4) * 100 evaluating
        # to just under 30.
        delta_pp = round((present - absent) * 100, 2)
        passed = delta_pp >= PASS_DELTA_PP
        verdict = "PASS" if passed else "FAIL"
        if passed:
            pass_count += 1
        effects[model] = {"delta_pp": delta_pp, "verdict": verdict}
        print(
            f" {model:30s} {present*100:.1f}% -> {absent*100:.1f}% "
            f"= {delta_pp:+.1f}pp [{verdict}]"
        )
    return effects, pass_count


def run():
    """Run the pre-flight: generate, aggregate, score and save the results.

    Prints progress and verdicts to stdout and writes
    ``preflight_v3_results.json`` next to this script.
    """
    cfg = TOPICS[TOPIC_KEY]
    source = load_source(cfg["source_file"])
    client = get_openai_client()

    print(f"Topic: {TOPIC_KEY}")
    print(f"Source: {len(source)} chars")
    print(f"Generators: {GENERATORS}")
    print("Conditions: source_present, source_absent")
    print(f"Versions per cell: {VERSIONS}")
    print(f"Total generations: {len(GENERATORS) * len(CONDITIONS) * VERSIONS}")
    print()

    results = _generate_documents(client, cfg, source)

    # Aggregate
    print()
    print("=" * 70)
    print("AGGREGATE")
    print("=" * 70)
    summary = _summarize(results)

    # Source-grounding effect (pp delta)
    print()
    print("SOURCE-GROUNDING EFFECT (source-present vs source-absent):")
    effects, pass_count = _score_effect(summary)

    print()
    overall = "PASS" if pass_count >= MIN_PASSING_GENERATORS else "FAIL"
    print(f"Overall: {pass_count}/{len(GENERATORS)} generators passed [{overall}]")

    # Save
    out_path = os.path.join(THIS_DIR, "preflight_v3_results.json")
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump({
            "timestamp": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
            "design": {
                "topic": TOPIC_KEY,
                "generators": GENERATORS,
                "conditions": list(CONDITIONS),
                "versions_per_cell": VERSIONS,
            },
            "summary": {
                f"{k[0]}|{k[1]}": v for k, v in summary.items()
            },
            "effect": effects,
            "overall": overall,
            "results": results,
        }, f, indent=2)
    print()
    print(f"Saved: {out_path}")


if __name__ == "__main__":
    run()