"""Recompute the Gemini Flash specificity effect from data.json.

Standard library only. Run:

    python3 script.py

Reproduces the specificity effect at density (markers per 1k words) for the
Gemini Flash 2x2, and prints the raw-score effect to show the length confound
that density normalization resolves.
"""

import json
import math
from statistics import mean, stdev


def hedges(a, b):
    """Return ``(d, g)``: Cohen's d and Hedges' g for samples ``a`` vs ``b``.

    ``d`` is the difference of means (``a`` minus ``b``) divided by the pooled
    standard deviation; ``g`` is ``d`` scaled by the small-sample correction
    ``1 - 3 / (4 * (na + nb) - 9)``.

    Raises:
        statistics.StatisticsError: if either sample has fewer than two
            values (raised by ``stdev``).
        ZeroDivisionError: if the pooled standard deviation is zero, i.e.
            both samples have no spread.
    """
    na, nb = len(a), len(b)
    # Pooled SD: sample variances weighted by their degrees of freedom.
    pooled_var = (
        (na - 1) * stdev(a) ** 2 + (nb - 1) * stdev(b) ** 2
    ) / (na + nb - 2)
    sp = math.sqrt(pooled_var)
    d = (mean(a) - mean(b)) / sp
    # 4 * (na + nb) - 9 == 4 * df - 1 with df = na + nb - 2.
    g = d * (1 - 3 / (4 * (na + nb) - 9))
    return d, g


def pull(ro, cells, key):
    """Collect ``record["score"][key]`` for every record in the given cells.

    ``ro`` maps a cell name to a sequence of records; values are returned as
    one flat list, in ``cells`` order and then record order within each cell.
    """
    return [r["score"][key] for c in cells for r in ro[c]]


def main(path="data.json"):
    """Print the specificity effect size for the density and raw-total scores.

    Reads the ``"raw_outputs"`` mapping from the JSON file at ``path``
    (default ``data.json``, resolved against the current working directory)
    and compares the SPEC_QUAL + SPEC_ONLY cells against the
    QUAL_ONLY + BARE cells.
    """
    # Context manager so the handle is closed even if parsing fails.
    with open(path, encoding="utf-8") as fh:
        ro = json.load(fh)["raw_outputs"]

    on, off = ["SPEC_QUAL", "SPEC_ONLY"], ["QUAL_ONLY", "BARE"]
    measures = [
        ("density (per 1k words)", "density_per_1kw"),
        ("raw total", "total"),
    ]
    for label, key in measures:
        d, g = hedges(pull(ro, on, key), pull(ro, off, key))
        print(f"specificity, {label:22}: Cohen d = {d:+.3f}, Hedges g = {g:+.3f}")
    print("\nDensity is the clean measure (d~1.67). Raw is confounded by length "
          "(quality demands lengthen outputs), which is why the raw specificity "
          "effect looks smaller.")


if __name__ == "__main__":
    main()