openai batch refactor

2026-05-06 13:53:50 -04:00
parent 946aeac7c8
commit 64a7a18721
5 changed files with 833 additions and 312 deletions
--- a/tests/analysis_gpt4o_batch.py
+++ b/tests/analysis_gpt4o_batch.py
@@ -75,9 +75,24 @@ ANALYZED_AT = "2026-05-05T18:00:00+00:00"
 RUN_ID = "test-run-id-123"
 MODEL = "gpt-4o"

+def _make_status(jobs_override=None):
+    """Return a minimal status.json dict for job-logic tests; None means one pending job."""
+    jobs = [
+        {"job_num": 1, "run_id": "r1", "status": "pending", "batch_id": None,
+         "records_submitted": 60, "records_completed": None, "records_failed": None,
+         "submitted_at": None, "completed_at": None},
+    ] if jobs_override is None else jobs_override  # `or` would also discard an explicit []
+    return {
+        "model": "gpt-4o-mini", "prompt_hash": "abc1234",
+        "input_file": "output/f452.jsonl", "input_sha256": "sha",
+        "total_comments": 100, "input_tokens": 50_000,
+        "est_queue_days": 0.025, "cost_$": 0.01,
+        "total_jobs": len(jobs), "jobs": jobs,
+    }
+

 # ---------------------------------------------------------------------------
-# Prompt versioning (batch reads the same prompt file)
+# Prompt versioning

 def test_prompt_version_is_7_hex_chars():
    assert len(bt.PROMPT_VERSION) == 7
@@ -206,52 +221,6 @@ def test_normalize_unknown_comment_id():
    assert record["input_title"] == ""


-# ---------------------------------------------------------------------------
-# Manifest
-
-def test_make_manifest_all_keys():
-    m = bt.make_manifest(
-        run_id=RUN_ID,
-        input_filename="output/forum452.jsonl",
-        input_sha256="abc123",
-        model="gpt-4o",
-        batch_id="batch_xyz",
-        records_submitted=100,
-        request_filename="analysis/gpt4o/requests/test-run-id-123.jsonl",
-    )
-    required = {
-        "run_id", "input_filename", "input_sha256", "prompt_hash", "model",
-        "batch_id", "records_submitted", "records_completed", "records_failed",
-        "request_filename", "raw_output_filename", "normalized_output_filename",
-        "created_at", "completed_at",
-    }
-    assert required == set(m.keys())
-
-
-def test_make_manifest_initial_nulls():
-    m = bt.make_manifest(
-        run_id=RUN_ID, input_filename="f", input_sha256="s",
-        model="gpt-4o", batch_id="b", records_submitted=10, request_filename="r",
-    )
-    assert m["records_completed"] is None
-    assert m["records_failed"] is None
-    assert m["raw_output_filename"] is None
-    assert m["normalized_output_filename"] is None
-    assert m["completed_at"] is None
-    assert m["prompt_hash"] == bt.PROMPT_VERSION
-
-
-def test_manifest_save_load_roundtrip(tmp_path, monkeypatch):
-    monkeypatch.setattr(bt, "RUNS_DIR", tmp_path)
-    m = bt.make_manifest(
-        run_id=RUN_ID, input_filename="f", input_sha256="s",
-        model="gpt-4o", batch_id="b", records_submitted=42, request_filename="r",
-    )
-    bt.save_manifest(m)
-    loaded = bt.load_manifest(RUN_ID)
-    assert loaded == m
-
-
 # ---------------------------------------------------------------------------
 # estimate_tokens

@@ -309,3 +278,112 @@ def test_chunk_preserves_all_comments(monkeypatch):
 def test_model_limits_has_required_models():
    for model in ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini"):
        assert model in bt.MODEL_LIMITS, f"{model} missing from MODEL_LIMITS"
+
+
+# ---------------------------------------------------------------------------
+# status.json helpers
+
+def test_status_save_load_roundtrip(tmp_path):
+    """A status dict written with save_status is read back equal by load_status."""
+    expected = _make_status()
+    bt.save_status(expected, tmp_path)
+    assert bt.load_status(tmp_path) == expected
+
+
+# ---------------------------------------------------------------------------
+# _find_next_eligible_job
+
+def test_find_next_eligible_job_first_job_pending():
+    """A single pending job is chosen as the target and no warning is returned."""
+    picked, note = bt._find_next_eligible_job(_make_status()["jobs"])
+    assert picked["job_num"] == 1
+    assert note is None
+
+
+def test_find_next_eligible_job_after_completed():
+    """Once job 1 is completed, pending job 2 becomes the target with no warning."""
+    finished = {"job_num": 1, "status": "completed", "batch_id": "b1",
+                "records_submitted": 60, "records_completed": 60, "records_failed": 0,
+                "submitted_at": "t", "completed_at": "t", "run_id": "r1"}
+    queued = {"job_num": 2, "status": "pending", "batch_id": None,
+              "records_submitted": 40, "records_completed": None, "records_failed": None,
+              "submitted_at": None, "completed_at": None, "run_id": "r2"}
+    jobs = [finished, queued]
+    picked, note = bt._find_next_eligible_job(jobs)
+    assert picked["job_num"] == 2
+    assert note is None
+
+
+def test_find_next_eligible_job_blocked_by_in_progress():
+    """An in_progress job 1 blocks pending job 2: no target, warning names the status."""
+    running = {"job_num": 1, "status": "in_progress", "batch_id": "b1",
+               "records_submitted": 60, "records_completed": None, "records_failed": None,
+               "submitted_at": "t", "completed_at": None, "run_id": "r1"}
+    queued = {"job_num": 2, "status": "pending", "batch_id": None,
+              "records_submitted": 40, "records_completed": None, "records_failed": None,
+              "submitted_at": None, "completed_at": None, "run_id": "r2"}
+    jobs = [running, queued]
+    picked, note = bt._find_next_eligible_job(jobs)
+    assert picked is None
+    assert note is not None
+    assert "in_progress" in note
+
+
+def test_find_next_eligible_job_all_completed():
+    """When the only job is completed there is neither a target nor a warning."""
+    finished = {"job_num": 1, "status": "completed", "batch_id": "b1",
+                "records_submitted": 60, "records_completed": 60, "records_failed": 0,
+                "submitted_at": "t", "completed_at": "t", "run_id": "r1"}
+    jobs = [finished]
+    picked, note = bt._find_next_eligible_job(jobs)
+    assert picked is None
+    assert note is None
+
+
+def test_resume_from_status_json(tmp_path):
+    """After a save/load round trip, the pending job following a completed one is picked."""
+    finished = {"job_num": 1, "run_id": "r1", "status": "completed", "batch_id": "b1",
+                "records_submitted": 60, "records_completed": 58, "records_failed": 2,
+                "submitted_at": "2026-05-06T10:00:00+00:00", "completed_at": "2026-05-06T11:00:00+00:00"}
+    queued = {"job_num": 2, "run_id": "r2", "status": "pending", "batch_id": None,
+              "records_submitted": 40, "records_completed": None, "records_failed": None,
+              "submitted_at": None, "completed_at": None}
+    jobs = [finished, queued]
+    bt.save_status(_make_status(jobs), tmp_path)
+    reloaded = bt.load_status(tmp_path)
+    picked, note = bt._find_next_eligible_job(reloaded["jobs"])
+    assert picked["job_num"] == 2
+    assert note is None
+
+
+# ---------------------------------------------------------------------------
+# normalize: out-of-order and duplicate custom_id
+
+def test_out_of_order_output_reconciled_by_custom_id():
+    """normalize_output_line pairs each raw line with the comment named by its custom_id."""
+    second = {**COMMENT_ITEM, "comment_id": "99999", "title": "Second comment"}
+    lookup = {COMMENT_ITEM["comment_id"]: COMMENT_ITEM, second["comment_id"]: second}
+
+    raw_lines = [
+        {**RAW_SUCCESS_LINE, "custom_id": "comment_99999"},
+        RAW_SUCCESS_LINE,
+    ]
+    first_out, second_out = (
+        bt.normalize_output_line(raw, lookup, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION)
+        for raw in raw_lines
+    )
+
+    assert first_out["comment_id"] == "99999"
+    assert first_out["input_title"] == "Second comment"
+    assert second_out["comment_id"] == "87914"
+    assert second_out["input_title"] == COMMENT_ITEM["title"]
+
+
+def test_duplicate_custom_id_both_produce_valid_records():
+    """Normalizing the same raw line twice gives two error-free records for comment 87914."""
+    args = (RAW_SUCCESS_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION)
+    first, second = bt.normalize_output_line(*args), bt.normalize_output_line(*args)
+    assert first["comment_id"] == second["comment_id"] == "87914"
+    assert first["error"] is None
+    assert second["error"] is None
--- a/tests/tokenizer.py
+++ b/tests/tokenizer.py
@@ -0,0 +1,201 @@
+"""Unit tests for analysis/gpt4o/tokenizer.py — no real API calls."""
+
+import io
+import json
+import math
+import sys
+from pathlib import Path
+from unittest.mock import patch
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
+import tokenizer as tk
+import analysis_batch as ab
+
+
+# ---------------------------------------------------------------------------
+# Fixtures
+
+FORUM_ITEM = {  # forum-level record handed to tk.compute_report as its second argument
+    "forum_id": "452",
+    "reg_title": "Model Policies for Transgender Students",
+    "reg_desc": "Guidance developed in response to HB 145.",
+}
+
+COMMENT_A = {  # first sample comment; same forum_id as FORUM_ITEM
+    "forum_id": "452",
+    "comment_id": "100",
+    "author": "Alice",
+    "date": "2021-01-04T09:15:00",
+    "title": "Support",
+    "text": "I support this policy.",
+}
+
+COMMENT_B = {  # second sample comment; same forum_id as FORUM_ITEM
+    "forum_id": "452",
+    "comment_id": "101",
+    "author": "Bob",
+    "date": "2021-01-05T10:00:00",
+    "title": "Oppose",
+    "text": "I oppose this policy.",
+}
+
+COMMENTS = [COMMENT_A, COMMENT_B]  # two-comment input used by _make_report
+PROMPT_HASH = "abc1234"  # placeholder hash, 7 characters
+INPUT_FILE = "output/f452.jsonl"
+INPUT_SHA256 = "deadbeef" * 8  # 64 hex characters
+PROMPT_FILE = "analysis/prompt-1.txt"
+
+
+def _make_report(total_tokens=10_000):
+    """Build the report for the fixture comments; ``total_tokens`` is accepted but unused."""
+    args = (COMMENTS, FORUM_ITEM, PROMPT_HASH, INPUT_FILE, INPUT_SHA256, PROMPT_FILE)
+    return tk.compute_report(*args)
+
+
+# ---------------------------------------------------------------------------
+# compute_report: required top-level keys
+
+def test_report_has_top_level_keys():
+    """The report contains every required top-level key (additional keys are allowed)."""
+    expected_keys = {"prompt", "prompt_hash", "input_file", "input_sha256",
+                     "total_comments", "input_tokens"}
+    assert expected_keys <= set(_make_report().keys())
+
+
+def test_report_metadata_values():
+    """Metadata arguments are echoed back in the report and total_comments is 2."""
+    report = _make_report()
+    expected = {"prompt": PROMPT_FILE, "prompt_hash": PROMPT_HASH, "input_file": INPUT_FILE,
+                "input_sha256": INPUT_SHA256, "total_comments": 2}
+    for key, value in expected.items():
+        assert report[key] == value
+
+
+def test_report_input_tokens_positive():
+    """input_tokens is an int greater than zero."""
+    tokens = _make_report()["input_tokens"]
+    assert isinstance(tokens, int) and tokens > 0
+
+
+# ---------------------------------------------------------------------------
+# compute_report: per-model entries
+
+def test_report_has_per_model_keys():  # each ab.MODEL_LIMITS model needs a report entry
+    report = _make_report()
+    for model in ab.MODEL_LIMITS:
+        assert model in report, f"Model {model} missing from report"
+        assert isinstance(report[model], dict)  # the per-model value must be a dict
+
+
+def test_report_per_model_has_required_fields():
+    """Each per-model entry exposes "jobs", "cost_$" and "est_queue_days"."""
+    report = _make_report()
+    for name in ab.MODEL_LIMITS:
+        entry = report[name]
+        for field in ("jobs", "cost_$", "est_queue_days"):
+            assert field in entry
+
+
+def test_report_jobs_at_least_one():
+    """The "jobs" count is at least 1 for every model in ab.MODEL_LIMITS."""
+    report = _make_report()
+    assert all(report[name]["jobs"] >= 1 for name in ab.MODEL_LIMITS)
+
+
+# ---------------------------------------------------------------------------
+# compute_report: calculation accuracy
+
+def test_cost_calculation():
+    """cost_$ equals input_tokens / 1,000,000 * the MODEL_PRICING rate, rounded to 4 places."""
+    report = _make_report()
+    millions = report["input_tokens"] / 1_000_000
+    for name in ab.MODEL_LIMITS:
+        rate = tk.MODEL_PRICING.get(name, 0.0)  # models without a price count as 0.0
+        assert report[name]["cost_$"] == pytest.approx(round(millions * rate, 4), abs=1e-6)
+
+
+def test_est_queue_days_calculation():
+    """est_queue_days equals input_tokens / the model's MODEL_LIMITS value, rounded to 2 places."""
+    report = _make_report()
+    tokens = report["input_tokens"]
+    for name, limit in ab.MODEL_LIMITS.items():
+        want = pytest.approx(round(tokens / limit, 2), abs=1e-4)
+        assert report[name]["est_queue_days"] == want
+
+
+def test_jobs_ceiling_division():
+    """jobs equals ceil(input_tokens / int(limit * ab._LIMIT_BUFFER)) for every model."""
+    report = _make_report()
+    tokens = report["input_tokens"]
+    for name, limit in ab.MODEL_LIMITS.items():
+        # The buffered limit is truncated to an int before the ceiling division.
+        budget = int(limit * ab._LIMIT_BUFFER)
+        assert report[name]["jobs"] == math.ceil(tokens / budget)
+
+
+def test_more_comments_increases_tokens():
+    """The two-comment report has more input_tokens than the report for COMMENT_A alone."""
+    shared = (FORUM_ITEM, PROMPT_HASH, INPUT_FILE, INPUT_SHA256, PROMPT_FILE)
+    one, two = tk.compute_report([COMMENT_A], *shared), tk.compute_report(COMMENTS, *shared)
+    assert two["input_tokens"] > one["input_tokens"]
+
+
+# ---------------------------------------------------------------------------
+# MODEL_PRICING coverage
+
+def test_model_pricing_has_required_models():  # every listed model must have a price entry
+    for model in ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini"):
+        assert model in tk.MODEL_PRICING, f"{model} missing from MODEL_PRICING"
+
+
+def test_model_pricing_values_positive():  # zero or negative prices are rejected
+    for model, price in tk.MODEL_PRICING.items():
+        assert price > 0, f"{model} has non-positive price"
+
+
+# ---------------------------------------------------------------------------
+# print_table: runs without error, produces output
+
+def test_print_table_runs():
+    """Captured stdout from print_table mentions gpt-4o and gpt-4o-mini."""
+    report = _make_report()
+    with patch("sys.stdout", io.StringIO()) as captured:
+        tk.print_table(report)
+    printed = captured.getvalue()
+    assert "gpt-4o" in printed  # NOTE: a "gpt-4o-mini" row alone also satisfies this
+    assert "gpt-4o-mini" in printed
+
+
+def test_print_table_shows_all_models():
+    """Every model in ab.MODEL_LIMITS appears in the captured print_table output."""
+    report = _make_report()
+    with patch("sys.stdout", io.StringIO()) as captured:
+        tk.print_table(report)
+    printed = captured.getvalue()
+    for name in ab.MODEL_LIMITS:
+        assert name in printed, f"{name} not shown in print_table output"
+
+
+def test_print_table_highlights_recommended():
+    """Captured print_table output for the fixture report contains "recommended"."""
+    # NOTE(review): presumably the cheapest single-job model is the one marked; confirm.
+    report = _make_report()
+    with patch("sys.stdout", io.StringIO()) as captured:
+        tk.print_table(report)
+    printed = captured.getvalue()
+    assert "recommended" in printed
+
+
+# ---------------------------------------------------------------------------
+# report.json round-trip (write → read)
+
+def test_report_json_roundtrip(tmp_path):
+    """A report dumped to report.json and parsed back keeps its counts and gpt-4o-mini jobs."""
+    report, target = _make_report(), tmp_path / "report.json"
+    target.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+    restored = json.loads(target.read_text(encoding="utf-8"))
+    for key in ("total_comments", "input_tokens"):
+        assert restored[key] == report[key]
+    assert restored["gpt-4o-mini"]["jobs"] == report["gpt-4o-mini"]["jobs"]