openai batch refactor

This commit is contained in:
2026-05-06 13:53:50 -04:00
parent 946aeac7c8
commit 64a7a18721
5 changed files with 833 additions and 312 deletions

View File

@@ -1,27 +1,23 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
analysis_batch.py — OpenAI Batch API pipeline analysis_batch.py — OpenAI Batch API job runner
Commands (run manually in order): Run tokenizer.py first to generate report.json, then:
submit <input_jsonl> [--model gpt-4o] [--limit N] create <report.json> --model <model> — build job directory
— build request file, upload, create batch submit [--job N] [--dir DIR] — submit next eligible job
status [run_id] — check batch status, update manifest status [--job N] [--dir DIR] — check job status
download [run_id] — download + normalize output, update manifest download [--job N] [--dir DIR] — download + normalize completed jobs
run_id defaults to the most recent run in runs/ when omitted. DIR is a name under analysis/gpt4o/jobs/ (default: most recently created).
File layout (all under analysis/gpt4o/):
requests/<run_id>.jsonl — batch input sent to OpenAI
raw/<run_id>.jsonl — raw batch output from OpenAI
runs/<run_id>.json — run manifest
<run_id>_<model>.jsonl — normalized output (same schema as realtime)
""" """
import argparse import argparse
import hashlib import hashlib
import json import json
import os import os
import shutil
import sys import sys
import uuid
from datetime import datetime, timezone from datetime import datetime, timezone
from pathlib import Path from pathlib import Path
@@ -35,9 +31,8 @@ except ImportError:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Model limits and token estimation # Model limits and token estimation
# Max enqueued tokens across ALL concurrent batches for this model # Max enqueued tokens across ALL concurrent batches (docs/openai.md, 2026-05-05).
# (docs/openai.md pricing table, updated 2026-05-05). # Org-tier limits may be lower; use --job to limit submission size if needed.
# NOTE: your org tier may be lower — if a submit fails, use --limit to reduce chunk size.
MODEL_LIMITS: dict[str, int] = { MODEL_LIMITS: dict[str, int] = {
"gpt-5.5": 900_000, "gpt-5.5": 900_000,
"gpt-5.4": 900_000, "gpt-5.4": 900_000,
@@ -48,8 +43,6 @@ MODEL_LIMITS: dict[str, int] = {
"gpt-o4-mini": 2_000_000, "gpt-o4-mini": 2_000_000,
} }
_DEFAULT_TOKEN_LIMIT = 900_000 _DEFAULT_TOKEN_LIMIT = 900_000
# tiktoken encoding per model family; unknown models fall back to o200k_base
_MODEL_ENCODING: dict[str, str] = { _MODEL_ENCODING: dict[str, str] = {
"gpt-5.5": "o200k_base", "gpt-5.5": "o200k_base",
"gpt-5.4": "o200k_base", "gpt-5.4": "o200k_base",
@@ -59,16 +52,11 @@ _MODEL_ENCODING: dict[str, str] = {
"gpt-4o-mini": "o200k_base", "gpt-4o-mini": "o200k_base",
"gpt-o4-mini": "o200k_base", "gpt-o4-mini": "o200k_base",
} }
# Leave 10% headroom below the published limit
_LIMIT_BUFFER = 0.90 _LIMIT_BUFFER = 0.90
def estimate_tokens(messages: list[dict], model: str) -> int: def estimate_tokens(messages: list[dict], model: str) -> int:
"""Estimate token count for a messages list. """Exact token count via tiktoken; falls back to chars/3 + 4 overhead per message."""
Uses tiktoken when available (exact for OpenAI models); falls back to
chars/3 + 4-token overhead per message for unknown/Anthropic models.
"""
try: try:
import tiktoken import tiktoken
enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base")) enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base"))
@@ -80,14 +68,11 @@ def estimate_tokens(messages: list[dict], model: str) -> int:
def chunk_comments_by_tokens( def chunk_comments_by_tokens(
comments: list[dict], forum: dict | None, model: str comments: list[dict], forum: dict | None, model: str
) -> list[list[dict]]: ) -> list[list[dict]]:
"""Split comments into chunks where each chunk fits under the model token limit.""" """Greedy bin-pack comments into chunks that fit under the model TPD limit."""
raw_limit = MODEL_LIMITS.get(model, _DEFAULT_TOKEN_LIMIT) token_limit = int(MODEL_LIMITS.get(model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)
token_limit = int(raw_limit * _LIMIT_BUFFER)
chunks: list[list[dict]] = [] chunks: list[list[dict]] = []
current: list[dict] = [] current: list[dict] = []
current_tokens = 0 current_tokens = 0
for comment in comments: for comment in comments:
messages, _ = build_messages(comment, forum) messages, _ = build_messages(comment, forum)
tokens = estimate_tokens(messages, model) tokens = estimate_tokens(messages, model)
@@ -98,10 +83,8 @@ def chunk_comments_by_tokens(
else: else:
current.append(comment) current.append(comment)
current_tokens += tokens current_tokens += tokens
if current: if current:
chunks.append(current) chunks.append(current)
return chunks return chunks
@@ -114,11 +97,11 @@ PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
def _load_prompt(path: Path) -> None: def _load_prompt(path: Path) -> None:
"""Re-read a prompt file, updating module-level SYSTEM_PROMPT and PROMPT_VERSION."""
global SYSTEM_PROMPT, PROMPT_VERSION global SYSTEM_PROMPT, PROMPT_VERSION
SYSTEM_PROMPT = path.read_text(encoding="utf-8").strip() SYSTEM_PROMPT = path.read_text(encoding="utf-8").strip()
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7] PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
USER_TEMPLATE = """\ USER_TEMPLATE = """\
## Proposed Regulation ## Proposed Regulation
Title: {reg_title} Title: {reg_title}
@@ -141,17 +124,15 @@ MAX_COMMENT_CHARS = 6000
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Directories # Directories
_SCRIPT_DIR = Path(__file__).parent _SCRIPT_DIR = Path(__file__).parent
REQUESTS_DIR = _SCRIPT_DIR / "requests" JOBS_DIR = _SCRIPT_DIR / "jobs"
RAW_DIR = _SCRIPT_DIR / "raw"
RUNS_DIR = _SCRIPT_DIR / "runs"
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Core functions (importable for tests) # Core functions (importable for tests)
def load_items(path: Path) -> tuple[dict | None, list[dict]]: def load_items(path: Path) -> tuple[dict | None, list[dict]]:
"""Read a scraped JSONL file. Returns (forum_item_or_None, [comment_items]).""" """Read a scraped JSONL. Returns (forum_item_or_None, [comment_items])."""
forum = None forum = None
comments = [] comments = []
with open(path, encoding="utf-8") as f: with open(path, encoding="utf-8") as f:
@@ -172,7 +153,6 @@ def custom_id_from(comment_id: str) -> str:
def parse_custom_id(custom_id: str) -> str: def parse_custom_id(custom_id: str) -> str:
"""Return comment_id from a custom_id string."""
return custom_id.removeprefix("comment_") return custom_id.removeprefix("comment_")
@@ -180,7 +160,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
"""Build OpenAI messages for one comment. Returns (messages, truncated).""" """Build OpenAI messages for one comment. Returns (messages, truncated)."""
reg_title = (forum or {}).get("reg_title", "[unknown]") reg_title = (forum or {}).get("reg_title", "[unknown]")
reg_desc = (forum or {}).get("reg_desc", "[unknown]") reg_desc = (forum or {}).get("reg_desc", "[unknown]")
body = (comment.get("text") or "").strip() body = (comment.get("text") or "").strip()
truncated = False truncated = False
if not body: if not body:
@@ -188,7 +167,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
elif len(body) > MAX_COMMENT_CHARS: elif len(body) > MAX_COMMENT_CHARS:
body = body[:MAX_COMMENT_CHARS] + "... [truncated]" body = body[:MAX_COMMENT_CHARS] + "... [truncated]"
truncated = True truncated = True
user_text = USER_TEMPLATE.format( user_text = USER_TEMPLATE.format(
reg_title=reg_title, reg_title=reg_title,
reg_desc=reg_desc, reg_desc=reg_desc,
@@ -196,7 +174,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
comment_title=comment.get("title", ""), comment_title=comment.get("title", ""),
comment_text=body, comment_text=body,
) )
return [ return [
{"role": "system", "content": SYSTEM_PROMPT}, {"role": "system", "content": SYSTEM_PROMPT},
{"role": "user", "content": user_text}, {"role": "user", "content": user_text},
@@ -204,7 +181,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
def build_batch_request_line(comment: dict, forum: dict | None, model: str) -> dict: def build_batch_request_line(comment: dict, forum: dict | None, model: str) -> dict:
"""Build one line of the batch input JSONL."""
messages, _ = build_messages(comment, forum) messages, _ = build_messages(comment, forum)
return { return {
"custom_id": custom_id_from(comment["comment_id"]), "custom_id": custom_id_from(comment["comment_id"]),
@@ -227,14 +203,9 @@ def normalize_output_line(
model: str, model: str,
prompt_version: str, prompt_version: str,
) -> dict: ) -> dict:
"""Convert one raw batch output line into a normalized analysis record. """Convert one raw batch output line into a normalized analysis record."""
comment_lookup: {comment_id: CommentItem dict}
prompt_version: taken from the run manifest so it reflects what was submitted.
"""
comment_id = parse_custom_id(raw_line.get("custom_id", "")) comment_id = parse_custom_id(raw_line.get("custom_id", ""))
comment = comment_lookup.get(comment_id, {}) comment = comment_lookup.get(comment_id, {})
base = { base = {
"run_id": run_id, "run_id": run_id,
"forum_id": comment.get("forum_id", ""), "forum_id": comment.get("forum_id", ""),
@@ -245,20 +216,16 @@ def normalize_output_line(
"input_title": comment.get("title", ""), "input_title": comment.get("title", ""),
"truncated": len(comment.get("text") or "") > MAX_COMMENT_CHARS, "truncated": len(comment.get("text") or "") > MAX_COMMENT_CHARS,
} }
# Check for outer-level batch error (e.g. batch_expired)
if raw_line.get("error"): if raw_line.get("error"):
err = raw_line["error"] err = raw_line["error"]
err_msg = err.get("message", str(err)) if isinstance(err, dict) else str(err) err_msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
return {**base, "stance": None, "stance_confidence": None, return {**base, "stance": None, "stance_confidence": None,
"stance_rationale": None, "tone": None, "tags": None, "error": err_msg} "stance_rationale": None, "tone": None, "tags": None, "error": err_msg}
response = raw_line.get("response") or {} response = raw_line.get("response") or {}
if response.get("status_code") != 200: if response.get("status_code") != 200:
return {**base, "stance": None, "stance_confidence": None, return {**base, "stance": None, "stance_confidence": None,
"stance_rationale": None, "tone": None, "tags": None, "stance_rationale": None, "tone": None, "tags": None,
"error": f"status {response.get('status_code')}"} "error": f"status {response.get('status_code')}"}
try: try:
content = response["body"]["choices"][0]["message"]["content"] content = response["body"]["choices"][0]["message"]["content"]
data = json.loads(content) data = json.loads(content)
@@ -270,278 +237,372 @@ def normalize_output_line(
"stance_rationale": None, "tone": None, "tags": None, "error": str(exc)} "stance_rationale": None, "tone": None, "tags": None, "error": str(exc)}
def make_manifest( # ---------------------------------------------------------------------------
run_id: str, # Job directory management
input_filename: str,
input_sha256: str,
model: str, def _next_job_dir(stem: str) -> Path:
batch_id: str, base = stem[:8]
records_submitted: int, i = 1
request_filename: str, while (JOBS_DIR / f"{base}-{i}").exists():
) -> dict: i += 1
return { return JOBS_DIR / f"{base}-{i}"
"run_id": run_id,
"input_filename": input_filename,
"input_sha256": input_sha256, def _latest_job_dir() -> Path:
"prompt_hash": PROMPT_VERSION, if not JOBS_DIR.exists():
"model": model, sys.exit(f"No jobs directory found. Run 'create' first.")
"batch_id": batch_id, status_files = list(JOBS_DIR.glob("*/status.json"))
"records_submitted": records_submitted, if not status_files:
"records_completed": None, sys.exit(f"No jobs found in {JOBS_DIR}. Run 'create' first.")
"records_failed": None, return max(status_files, key=lambda p: p.stat().st_mtime).parent
"request_filename": request_filename,
"raw_output_filename": None,
"normalized_output_filename": None, def _resolve_job_dir(args) -> Path:
"created_at": datetime.now(timezone.utc).isoformat(), if getattr(args, "dir", None):
"completed_at": None, d = Path(args.dir)
if not d.is_absolute():
d = JOBS_DIR / d
if not d.exists():
sys.exit(f"Job directory not found: {d}")
return d
return _latest_job_dir()
def load_status(job_dir: Path) -> dict:
return json.loads((job_dir / "status.json").read_text(encoding="utf-8"))
def save_status(status: dict, job_dir: Path) -> None:
(job_dir / "status.json").write_text(
json.dumps(status, indent=2, ensure_ascii=False), encoding="utf-8"
)
def _find_next_eligible_job(jobs: list[dict]) -> tuple[dict | None, str | None]:
"""Return (next_pending_job, None) or (None, warning_message).
A job is eligible when it is 'pending' and either it is the first job
or its predecessor is 'completed'.
"""
for j in jobs:
if j["status"] != "pending":
continue
if j["job_num"] == 1:
return j, None
prev = next(p for p in jobs if p["job_num"] == j["job_num"] - 1)
if prev["status"] == "completed":
return j, None
if prev["status"] in ("submitted", "in_progress", "validating", "finalizing"):
return None, (
f"Job {prev['job_num']} is '{prev['status']}'. "
f"Wait for it to complete before submitting job {j['job_num']}."
)
return None, None
# ---------------------------------------------------------------------------
# Subcommand: create
def cmd_create(args) -> None:
report_path = Path(args.report)
if not report_path.exists():
sys.exit(f"Report not found: {report_path}")
report = json.loads(report_path.read_text(encoding="utf-8"))
if args.model not in report or not isinstance(report[args.model], dict):
available = [k for k in report if isinstance(report.get(k), dict)]
sys.exit(f"Model '{args.model}' not in report. Available: {', '.join(available)}")
prompt_path = Path(report["prompt"])
if not prompt_path.exists():
sys.exit(f"Prompt file not found: {prompt_path}")
_load_prompt(prompt_path)
input_path = Path(report["input_file"])
if not input_path.exists():
sys.exit(f"Input file not found: {input_path}")
forum, comments = load_items(input_path)
if not comments:
sys.exit("No comment items found in input file.")
chunks = chunk_comments_by_tokens(comments, forum, args.model)
stem = input_path.stem[:8]
job_dir = _next_job_dir(stem)
JOBS_DIR.mkdir(parents=True, exist_ok=True)
job_dir.mkdir()
shutil.copy2(input_path, job_dir / "forum.jsonl")
shutil.copy2(prompt_path, job_dir / "prompt.txt")
shutil.copy2(report_path, job_dir / "report.json")
jobs_meta = []
for i, chunk in enumerate(chunks, start=1):
req_path = job_dir / f"job{i}-input.jsonl"
with open(req_path, "w", encoding="utf-8") as f:
for comment in chunk:
f.write(json.dumps(build_batch_request_line(comment, forum, args.model),
ensure_ascii=False) + "\n")
jobs_meta.append({
"job_num": i,
"run_id": str(uuid.uuid4()),
"status": "pending",
"batch_id": None,
"records_submitted": len(chunk),
"records_completed": None,
"records_failed": None,
"submitted_at": None,
"completed_at": None,
})
model_info = report[args.model]
status = {
"model": args.model,
"prompt_hash": report["prompt_hash"],
"input_file": str(input_path),
"input_sha256": report["input_sha256"],
"total_comments": report["total_comments"],
"input_tokens": report["input_tokens"],
"est_queue_days": model_info["est_queue_days"],
"cost_$": model_info["cost_$"],
"total_jobs": len(chunks),
"jobs": jobs_meta,
} }
save_status(status, job_dir)
print(f"Created: {job_dir.name}")
def _latest_run_id() -> str: print(f" {len(chunks)} job(s) | {len(comments)} comments | model: {args.model}")
"""Return the run_id of the most recently saved manifest, or exit if none found.""" print(f"\nNext: python analysis/gpt4o/analysis_batch.py submit")
runs = list(RUNS_DIR.glob("*.json")) if RUNS_DIR.exists() else []
if not runs:
sys.exit(f"No runs found in {RUNS_DIR}. Submit a batch first.")
latest = max(runs, key=lambda p: p.stat().st_mtime)
return latest.stem
def load_manifest(run_id: str) -> dict:
path = RUNS_DIR / f"{run_id}.json"
return json.loads(path.read_text(encoding="utf-8"))
def save_manifest(manifest: dict) -> None:
RUNS_DIR.mkdir(parents=True, exist_ok=True)
path = RUNS_DIR / f"{manifest['run_id']}.json"
path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Subcommand: submit # Subcommand: submit
def _submit_chunk(
chunk: list[dict],
forum: dict | None,
input_path: Path,
input_sha256: str,
model: str,
client,
chunk_index: int,
total_chunks: int,
) -> str:
"""Upload and submit one chunk of comments. Returns the run_id."""
import uuid
run_id = str(uuid.uuid4())
label = f"chunk {chunk_index + 1}/{total_chunks}" if total_chunks > 1 else "single batch"
REQUESTS_DIR.mkdir(parents=True, exist_ok=True) def cmd_submit(args, client) -> None:
request_path = REQUESTS_DIR / f"{run_id}.jsonl" job_dir = _resolve_job_dir(args)
with open(request_path, "w", encoding="utf-8") as f: status = load_status(job_dir)
for comment in chunk: jobs = status["jobs"]
line = build_batch_request_line(comment, forum, model)
f.write(json.dumps(line, ensure_ascii=False) + "\n")
print(f"[{label}] Wrote {len(chunk)} requests → {request_path}", file=sys.stderr) if args.job:
target = next((j for j in jobs if j["job_num"] == args.job), None)
if target is None:
sys.exit(f"Job {args.job} not found in {job_dir.name}.")
if target["status"] != "pending":
sys.exit(f"Job {args.job} is already '{target['status']}' — cannot resubmit.")
if target["job_num"] > 1:
prev = next(p for p in jobs if p["job_num"] == target["job_num"] - 1)
if prev["status"] != "completed":
sys.exit(
f"Cannot submit job {target['job_num']}: "
f"job {prev['job_num']} is '{prev['status']}' (must be 'completed')."
)
else:
target, warning = _find_next_eligible_job(jobs)
if warning:
print(warning, file=sys.stderr)
sys.exit(1)
if target is None:
all_done = all(j["status"] == "completed" for j in jobs)
print("All jobs completed." if all_done else "No pending jobs eligible for submission.")
return
with open(request_path, "rb") as f: n = target["job_num"]
req_path = job_dir / f"job{n}-input.jsonl"
print(f"Submitting job {n}/{status['total_jobs']} ({target['records_submitted']} comments) ...",
file=sys.stderr)
with open(req_path, "rb") as f:
uploaded = client.files.create(file=f, purpose="batch") uploaded = client.files.create(file=f, purpose="batch")
print(f"[{label}] Uploaded: {uploaded.id}", file=sys.stderr)
batch = client.batches.create( batch = client.batches.create(
input_file_id=uploaded.id, input_file_id=uploaded.id,
endpoint="/v1/chat/completions", endpoint="/v1/chat/completions",
completion_window="24h", completion_window="24h",
metadata={"run_id": run_id, "input_filename": str(input_path)}, metadata={"run_id": target["run_id"], "job_dir": job_dir.name},
)
print(f"[{label}] Batch created: {batch.id} status={batch.status}", file=sys.stderr)
manifest = make_manifest(
run_id=run_id,
input_filename=str(input_path),
input_sha256=input_sha256,
model=model,
batch_id=batch.id,
records_submitted=len(chunk),
request_filename=str(request_path),
)
save_manifest(manifest)
return run_id
def cmd_submit(args, client) -> None:
_load_prompt(Path(args.prompt))
print(f"Prompt: {args.prompt} (version {PROMPT_VERSION})", file=sys.stderr)
input_path = Path(args.input)
if not input_path.exists():
sys.exit(f"File not found: {input_path}")
print(f"Reading {input_path} ...", file=sys.stderr)
forum, comments = load_items(input_path)
if not comments:
sys.exit("No comment items found in input file.")
if forum is None:
print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)
if args.limit:
comments = comments[:args.limit]
print(f"Limiting to {len(comments)} comments (--limit {args.limit}).", file=sys.stderr)
token_limit = int(MODEL_LIMITS.get(args.model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)
chunks = chunk_comments_by_tokens(comments, forum, args.model)
total = len(chunks)
print(
f"Model: {args.model} token limit: {token_limit:,} "
f"{len(comments)} comments split into {total} chunk(s).",
file=sys.stderr,
) )
input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest() target["status"] = "submitted"
target["batch_id"] = batch.id
target["submitted_at"] = datetime.now(timezone.utc).isoformat()
save_status(status, job_dir)
# Submit only the first chunk — the enqueued token limit is a TOTAL across all print(f"Job {n} submitted: {batch.id} ({batch.status})")
# concurrent batches, so stacking multiple submissions will exceed the quota. print(f" python analysis/gpt4o/analysis_batch.py status")
# Wait for each batch to complete before submitting the next.
run_id = _submit_chunk(chunks[0], forum, input_path, input_sha256, args.model, client, 0, total)
print(f"\nBatch 1/{total} submitted.", file=sys.stderr)
print(f" status: python analysis/gpt4o/analysis_batch.py status {run_id}", file=sys.stderr)
print(f" download: python analysis/gpt4o/analysis_batch.py download {run_id}", file=sys.stderr)
if total > 1:
remaining = sum(len(c) for c in chunks[1:])
print(f"\n{total - 1} more chunk(s) remaining ({remaining} comments).", file=sys.stderr)
print("After this batch completes and is downloaded, rerun submit with --limit to get the next chunk:", file=sys.stderr)
offset = len(chunks[0])
for idx, chunk in enumerate(chunks[1:], start=2):
print(f" chunk {idx}/{total}: comments {offset}{offset + len(chunk) - 1}", file=sys.stderr)
offset += len(chunk)
print(run_id) # stdout for scripting
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Subcommand: status # Subcommand: status
def cmd_status(args, client) -> None: def cmd_status(args, client) -> None:
run_id = args.run_id or _latest_run_id() job_dir = _resolve_job_dir(args)
if not args.run_id: status = load_status(job_dir)
print(f"(using latest run: {run_id})", file=sys.stderr) jobs = status["jobs"]
manifest = load_manifest(run_id)
batch = client.batches.retrieve(manifest["batch_id"])
counts = batch.request_counts job_filter = getattr(args, "job", None)
print(f"status: {batch.status}")
print(f"completed: {counts.completed}/{counts.total}")
print(f"failed: {counts.failed}")
manifest["records_completed"] = counts.completed for job in jobs:
manifest["records_failed"] = counts.failed if job_filter is not None and job["job_num"] != job_filter:
save_manifest(manifest) continue
if not job["batch_id"]:
continue
if job["status"] in ("completed", "failed", "expired", "cancelled", "pending"):
continue
batch = client.batches.retrieve(job["batch_id"])
counts = batch.request_counts
if batch.status == "completed":
job["status"] = "completed"
if batch.completed_at:
job["completed_at"] = datetime.fromtimestamp(
batch.completed_at, tz=timezone.utc
).isoformat()
elif batch.status in ("failed", "expired", "cancelled"):
job["status"] = batch.status
else:
job["status"] = batch.status
job["records_completed"] = counts.completed
job["records_failed"] = counts.failed
if batch.status == "completed": save_status(status, job_dir)
print(f"\nReady to download. Run:")
print(f" python analysis/gpt4o/analysis_batch.py download {run_id}") target_jobs = jobs if not job_filter else [j for j in jobs if j["job_num"] == job_filter]
print(f"Dir: {job_dir.name} | Model: {status['model']} | {status['total_jobs']} job(s)")
print(f"{'Job':<5} {'Status':<14} {'Records':>12} {'Submitted':<20} {'Completed':<20}")
print("-" * 76)
for j in target_jobs:
rec = (f"{j['records_completed']}/{j['records_submitted']}"
if j["records_completed"] is not None else f"-/{j['records_submitted']}")
sub = (j["submitted_at"] or "-")[:19]
done = (j["completed_at"] or "-")[:19]
print(f"{j['job_num']:<5} {j['status']:<14} {rec:>12} {sub:<20} {done:<20}")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Subcommand: download # Subcommand: download
def cmd_download(args, client) -> None: def cmd_download(args, client) -> None:
run_id = args.run_id or _latest_run_id() job_dir = _resolve_job_dir(args)
if not args.run_id:
print(f"(using latest run: {run_id})", file=sys.stderr)
manifest = load_manifest(run_id)
batch = client.batches.retrieve(manifest["batch_id"])
if batch.status != "completed": # Refresh status before deciding what to download
sys.exit(f"Batch not complete yet (status={batch.status}). Run 'status' to check.") cmd_status(args, client)
status = load_status(job_dir)
jobs = status["jobs"]
run_id = manifest["run_id"] job_filter = getattr(args, "job", None)
model = manifest["model"] if job_filter:
model_slug = model.replace("/", "-") candidates = [j for j in jobs if j["job_num"] == job_filter]
else:
candidates = [
j for j in jobs
if j["status"] == "completed"
and not (job_dir / f"job{j['job_num']}-output.jsonl").exists()
]
# Download raw output if not candidates:
RAW_DIR.mkdir(parents=True, exist_ok=True) print("No completed jobs pending download.", file=sys.stderr)
raw_path = RAW_DIR / f"{run_id}.jsonl" return
raw_text = client.files.content(batch.output_file_id).text
raw_path.write_text(raw_text, encoding="utf-8")
print(f"Raw output → {raw_path}", file=sys.stderr)
# Build comment lookup from original input for reconciliation _, all_comments = load_items(job_dir / "forum.jsonl")
input_path = Path(manifest["input_filename"]) comment_lookup = {c["comment_id"]: c for c in all_comments}
_, comments = load_items(input_path)
comment_lookup = {c["comment_id"]: c for c in comments}
# Normalize for job in candidates:
completed_at = datetime.now(timezone.utc).isoformat() n = job["job_num"]
if batch.completed_at:
completed_at = datetime.fromtimestamp(batch.completed_at, tz=timezone.utc).isoformat()
normalized_path = _SCRIPT_DIR / f"{run_id}_{model_slug}.jsonl" if job["status"] != "completed":
n_ok = n_err = 0 print(f"Job {n} not yet completed ('{job['status']}'), skipping.", file=sys.stderr)
with open(normalized_path, "w", encoding="utf-8") as out: continue
for line in raw_text.splitlines():
if not line.strip():
continue
raw_line = json.loads(line)
record = normalize_output_line(raw_line, comment_lookup, run_id, completed_at, model, manifest["prompt_hash"])
out.write(json.dumps(record, ensure_ascii=False) + "\n")
if record["error"]:
n_err += 1
else:
n_ok += 1
print(f"Normalized → {normalized_path} ({n_ok} ok, {n_err} errors)", file=sys.stderr) batch = client.batches.retrieve(job["batch_id"])
manifest["records_completed"] = n_ok if not batch.output_file_id:
manifest["records_failed"] = n_err print(f"Job {n}: no output file available from OpenAI.", file=sys.stderr)
manifest["raw_output_filename"] = str(raw_path) continue
manifest["normalized_output_filename"] = str(normalized_path)
manifest["completed_at"] = completed_at raw_text = client.files.content(batch.output_file_id).text
save_manifest(manifest) raw_path = job_dir / f"job{n}-output-raw.jsonl"
print(f"Manifest updated → {RUNS_DIR / run_id}.json", file=sys.stderr) raw_path.write_text(raw_text, encoding="utf-8")
print(f"Job {n} raw → {raw_path.name}", file=sys.stderr)
if batch.error_file_id:
err_text = client.files.content(batch.error_file_id).text
err_path = job_dir / f"job{n}-errors.jsonl"
err_path.write_text(err_text, encoding="utf-8")
n_err_lines = sum(1 for line in err_text.splitlines() if line.strip())
print(f"Job {n} errors → {err_path.name} ({n_err_lines} lines)", file=sys.stderr)
completed_at = job.get("completed_at") or datetime.now(timezone.utc).isoformat()
norm_path = job_dir / f"job{n}-output.jsonl"
n_ok = n_err = 0
with open(norm_path, "w", encoding="utf-8") as out:
for line in raw_text.splitlines():
if not line.strip():
continue
record = normalize_output_line(
json.loads(line), comment_lookup,
job["run_id"], completed_at,
status["model"], status["prompt_hash"],
)
out.write(json.dumps(record, ensure_ascii=False) + "\n")
if record["error"]:
n_err += 1
else:
n_ok += 1
print(f"Job {n} normalized → {norm_path.name} ({n_ok} ok, {n_err} errors)", file=sys.stderr)
job["records_completed"] = n_ok
job["records_failed"] = n_err
save_status(status, job_dir)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# CLI # CLI
def _add_common_args(p: argparse.ArgumentParser) -> None:
p.add_argument("--job", type=int, default=None, metavar="N",
help="Job number within the run (default: auto)")
p.add_argument("--dir", default=None, metavar="DIR",
help="Job directory name or path (default: most recent)")
def main() -> None: def main() -> None:
load_dotenv() load_dotenv()
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
parser = argparse.ArgumentParser( parser = argparse.ArgumentParser(
description="Public comment batch analysis pipeline.", description="Batch analysis job runner.",
formatter_class=argparse.RawDescriptionHelpFormatter, formatter_class=argparse.RawDescriptionHelpFormatter,
epilog=__doc__, epilog=__doc__,
) )
sub = parser.add_subparsers(dest="command", required=True) sub = parser.add_subparsers(dest="command", required=True)
p_submit = sub.add_parser("submit", help="Build and submit a batch job") p_create = sub.add_parser("create", help="Create job directory from tokenizer report")
p_submit.add_argument("input", help="Path to scraped JSONL file") p_create.add_argument("report", help="Path to report.json from tokenizer.py")
p_submit.add_argument("--model", default="gpt-4o", help="OpenAI model (default: gpt-4o)") p_create.add_argument("--model", required=True, help="Model (e.g. gpt-4o-mini)")
p_submit.add_argument(
"--prompt",
default=str(_DEFAULT_PROMPT_FILE),
help="Path to system prompt file (default: analysis/prompt-1.txt)",
)
p_submit.add_argument(
"--limit", type=int, default=None, metavar="N",
help="Submit only the first N comments (useful for staying under token quota)",
)
p_status = sub.add_parser("status", help="Check batch status") p_submit = sub.add_parser("submit", help="Submit next eligible job")
p_status.add_argument("run_id", nargs="?", default=None, _add_common_args(p_submit)
help="run_id from submit (default: most recent run)")
p_download = sub.add_parser("download", help="Download and normalize completed batch") p_status = sub.add_parser("status", help="Check job status")
p_download.add_argument("run_id", nargs="?", default=None, _add_common_args(p_status)
help="run_id from submit (default: most recent run)")
p_download = sub.add_parser("download", help="Download and normalize completed jobs")
_add_common_args(p_download)
args = parser.parse_args() args = parser.parse_args()
if args.command == "create":
cmd_create(args)
return
api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
client = openai.OpenAI(api_key=api_key) client = openai.OpenAI(api_key=api_key)
if args.command == "submit": if args.command == "submit":

141
analysis/gpt4o/tokenizer.py Normal file
View File

@@ -0,0 +1,141 @@
#!/usr/bin/env python3
"""
tokenizer.py — estimate token usage and cost for a batch analysis run.
Usage:
python analysis/gpt4o/tokenizer.py output/f452.jsonl [--prompt analysis/prompt-1.txt]
Prints a per-model comparison table and writes report.json next to the input file.
Run this before analysis_batch.py create.
"""
import argparse
import hashlib
import json
import math
import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
import analysis_batch as _ab
# Input pricing ($/1M tokens, batch API) — from docs/openai.md, updated 2026-05-05.
# Add Anthropic/other models here when needed; only models with a LIMITS entry are reported.
MODEL_PRICING: dict[str, float] = {
"gpt-5.5": 2.50,
"gpt-5.4": 1.25,
"gpt-5.4-mini": 0.375,
"gpt-5.4-nano": 0.10,
"gpt-4o": 1.25,
"gpt-4o-mini": 0.075,
"gpt-o4-mini": 0.55,
}
def compute_report(
comments: list[dict],
forum: dict | None,
prompt_hash: str,
input_file: str,
input_sha256: str,
prompt_file: str,
) -> dict:
"""Compute token estimate and per-model job/cost/time breakdown."""
# Use gpt-4o encoding as the canonical estimator (same for all current models)
total_tokens = sum(
_ab.estimate_tokens(_ab.build_messages(c, forum)[0], "gpt-4o")
for c in comments
)
report: dict = {
"prompt": prompt_file,
"prompt_hash": prompt_hash,
"input_file": input_file,
"input_sha256": input_sha256,
"total_comments": len(comments),
"input_tokens": total_tokens,
}
for model, tpd in _ab.MODEL_LIMITS.items():
effective_tpd = int(tpd * _ab._LIMIT_BUFFER)
jobs = math.ceil(total_tokens / effective_tpd)
cost = round(total_tokens / 1_000_000 * MODEL_PRICING.get(model, 0.0), 4)
est_days = round(total_tokens / tpd, 2)
report[model] = {"jobs": jobs, "cost_$": cost, "est_queue_days": est_days}
return report
def print_table(report: dict) -> None:
"""Print a human-readable model comparison table to stdout."""
print(f"\nInput: {report['input_file']}")
print(f"Comments: {report['total_comments']:,}")
print(f"Tokens: {report['input_tokens']:,}")
print(f"Prompt: {report['prompt']} (hash: {report['prompt_hash']})")
print()
# Cheapest model that fits in one job
single_job_models = [m for m in _ab.MODEL_LIMITS if report.get(m, {}).get("jobs") == 1]
best = (min(single_job_models, key=lambda m: report[m]["cost_$"])
if single_job_models else None)
print(f"{'Model':<15} {'Jobs':>5} {'Cost ($)':>9} {'Est days':>9} {'Note'}")
print("-" * 62)
for model in _ab.MODEL_LIMITS:
if model not in report or not isinstance(report[model], dict):
continue
m = report[model]
note = "<-- recommended" if model == best else ""
print(f"{model:<15} {m['jobs']:>5} {m['cost_$']:>9.4f} {m['est_queue_days']:>9.2f} {note}")
print()
def main() -> None:
_default_prompt = Path(__file__).parent.parent / "prompt-1.txt"
parser = argparse.ArgumentParser(description="Estimate batch token usage and cost.")
parser.add_argument("input", help="Scraped JSONL file")
parser.add_argument(
"--prompt",
default=str(_default_prompt),
help=f"System prompt file (default: {_default_prompt.name})",
)
args = parser.parse_args()
input_path = Path(args.input)
if not input_path.exists():
sys.exit(f"File not found: {input_path}")
prompt_path = Path(args.prompt)
if not prompt_path.exists():
sys.exit(f"Prompt file not found: {prompt_path}")
prompt_text = prompt_path.read_text(encoding="utf-8").strip()
prompt_hash = hashlib.sha256(prompt_text.encode("utf-8")).hexdigest()[:7]
# Ensure build_messages uses the specified prompt
_ab._load_prompt(prompt_path)
forum, comments = _ab.load_items(input_path)
if not comments:
sys.exit("No comment items found.")
if forum is None:
print("Warning: no ForumItem — token estimates may be slightly low.", file=sys.stderr)
input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()
report = compute_report(
comments, forum, prompt_hash,
str(input_path), input_sha256, str(prompt_path),
)
print_table(report)
out_path = input_path.parent / "report.json"
out_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"Report written to: {out_path}")
print(f"\nNext: python analysis/gpt4o/analysis_batch.py create {out_path} --model <model>")
if __name__ == "__main__":
main()

View File

@@ -158,7 +158,7 @@ forum_id_input,comment_id,title,text,date,author,stance,stance_confidence,stance
- tests: 23 passing (pytest tests/analysis_gpt4o_batch.py), 51 total across suite - tests: 23 passing (pytest tests/analysis_gpt4o_batch.py), 51 total across suite
- datetime: [2026-05-06 Wed 08:55] - datetime: [2026-05-06 Wed 08:55]
* [ ] t1.2.3: batch job refactor * [X] t1.2.3: batch job refactor
This task encompasses intent and fixes for 1.2.1 and 1.2.2. This task encompasses intent and fixes for 1.2.1 and 1.2.2.
batch processing should be a resumable job queue, not a one-shot script. the user should not need to remember offsets, completed chunks, failed batches, or which comments remain. batch processing should be a resumable job queue, not a one-shot script. the user should not need to remember offsets, completed chunks, failed batches, or which comments remain.
** Acceptance Criteria ** Acceptance Criteria
@@ -200,6 +200,46 @@ batch processing should be a resumable job queue, not a one-shot script. the us
- resume from status.json - resume from status.json
- remaining-comment detection - remaining-comment detection
** notes
- analysis/gpt4o/tokenizer.py: new standalone script; imports analysis_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes report.json to input file's directory. MODEL_PRICING dict lives here (not in analysis_batch).
- analysis/gpt4o/analysis_batch.py: fully rewritten with four subcommands: create, submit, status, download. No longer uses REQUESTS_DIR / RAW_DIR / RUNS_DIR.
- Job directories: analysis/gpt4o/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
- status.json: tracks all jobs with pending/submitted/in_progress/completed/failed states. Updated by submit, status, download.
- _find_next_eligible_job: pure function for testability. Returns (next_pending_job, None) or (None, warning). Blocks submission if previous job is in_progress/submitted.
- create: no API key required. Reads report.json, re-chunks comments, writes all jobN-input.jsonl files, writes status.json.
- submit: uploads jobN-input.jsonl to Files API, creates batch, updates status.json to 'submitted'. Will not stack batches.
- status: retrieves batch from OpenAI, updates status.json counts and status.
- download: auto-runs status first, downloads output_file_id → jobN-output-raw.jsonl, error_file_id → jobN-errors.jsonl, normalizes → jobN-output.jsonl. Updates status.json.
- tests/test_tokenizer.py: 15 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, report.json round-trip.
*** usage
#+begin_src sh
# 1. estimate tokens and cost
python analysis/gpt4o/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
# writes output/report.json
# 2. create job directory (no api key needed)
python analysis/gpt4o/analysis_batch.py create output/report.json --model gpt-4o-mini
# creates analysis/gpt4o/jobs/f452-1/
# 3. submit first job
python analysis/gpt4o/analysis_batch.py submit
# 4. check status (repeat until completed)
python analysis/gpt4o/analysis_batch.py status
# 5. download and normalize
python analysis/gpt4o/analysis_batch.py download
# 6. submit next job (if multi-job run), then repeat 4-5
python analysis/gpt4o/analysis_batch.py submit
#+end_src
** evidence
- commit:
- tests: passing (pytest tests/analysis_gpt4o_batch.py tests/test_tokenizer.py)
- datetime: [2026-05-05 Tue]
* === Backlog === * === Backlog ===
* [ ] X: analysis validation view * [ ] X: analysis validation view
create a lightweight validation script that joins raw comments to normalized analysis output and writes a human-reviewable csv. create a lightweight validation script that joins raw comments to normalized analysis output and writes a human-reviewable csv.

View File

@@ -75,9 +75,24 @@ ANALYZED_AT = "2026-05-05T18:00:00+00:00"
RUN_ID = "test-run-id-123" RUN_ID = "test-run-id-123"
MODEL = "gpt-4o" MODEL = "gpt-4o"
# Minimal status.json for testing job logic
def _make_status(jobs_override=None):
jobs = jobs_override or [
{"job_num": 1, "run_id": "r1", "status": "pending", "batch_id": None,
"records_submitted": 60, "records_completed": None, "records_failed": None,
"submitted_at": None, "completed_at": None},
]
return {
"model": "gpt-4o-mini", "prompt_hash": "abc1234",
"input_file": "output/f452.jsonl", "input_sha256": "sha",
"total_comments": 100, "input_tokens": 50_000,
"est_queue_days": 0.025, "cost_$": 0.01,
"total_jobs": len(jobs), "jobs": jobs,
}
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Prompt versioning (batch reads the same prompt file) # Prompt versioning
def test_prompt_version_is_7_hex_chars(): def test_prompt_version_is_7_hex_chars():
assert len(bt.PROMPT_VERSION) == 7 assert len(bt.PROMPT_VERSION) == 7
@@ -206,52 +221,6 @@ def test_normalize_unknown_comment_id():
assert record["input_title"] == "" assert record["input_title"] == ""
# ---------------------------------------------------------------------------
# Manifest
def test_make_manifest_all_keys():
m = bt.make_manifest(
run_id=RUN_ID,
input_filename="output/forum452.jsonl",
input_sha256="abc123",
model="gpt-4o",
batch_id="batch_xyz",
records_submitted=100,
request_filename="analysis/gpt4o/requests/test-run-id-123.jsonl",
)
required = {
"run_id", "input_filename", "input_sha256", "prompt_hash", "model",
"batch_id", "records_submitted", "records_completed", "records_failed",
"request_filename", "raw_output_filename", "normalized_output_filename",
"created_at", "completed_at",
}
assert required == set(m.keys())
def test_make_manifest_initial_nulls():
m = bt.make_manifest(
run_id=RUN_ID, input_filename="f", input_sha256="s",
model="gpt-4o", batch_id="b", records_submitted=10, request_filename="r",
)
assert m["records_completed"] is None
assert m["records_failed"] is None
assert m["raw_output_filename"] is None
assert m["normalized_output_filename"] is None
assert m["completed_at"] is None
assert m["prompt_hash"] == bt.PROMPT_VERSION
def test_manifest_save_load_roundtrip(tmp_path, monkeypatch):
monkeypatch.setattr(bt, "RUNS_DIR", tmp_path)
m = bt.make_manifest(
run_id=RUN_ID, input_filename="f", input_sha256="s",
model="gpt-4o", batch_id="b", records_submitted=42, request_filename="r",
)
bt.save_manifest(m)
loaded = bt.load_manifest(RUN_ID)
assert loaded == m
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# estimate_tokens # estimate_tokens
@@ -309,3 +278,112 @@ def test_chunk_preserves_all_comments(monkeypatch):
def test_model_limits_has_required_models(): def test_model_limits_has_required_models():
for model in ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini"): for model in ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini"):
assert model in bt.MODEL_LIMITS, f"{model} missing from MODEL_LIMITS" assert model in bt.MODEL_LIMITS, f"{model} missing from MODEL_LIMITS"
# ---------------------------------------------------------------------------
# status.json helpers
def test_status_save_load_roundtrip(tmp_path):
status = _make_status()
bt.save_status(status, tmp_path)
loaded = bt.load_status(tmp_path)
assert loaded == status
# ---------------------------------------------------------------------------
# _find_next_eligible_job
def test_find_next_eligible_job_first_job_pending():
jobs = _make_status()["jobs"]
target, warning = bt._find_next_eligible_job(jobs)
assert target["job_num"] == 1
assert warning is None
def test_find_next_eligible_job_after_completed():
jobs = [
{"job_num": 1, "status": "completed", "batch_id": "b1",
"records_submitted": 60, "records_completed": 60, "records_failed": 0,
"submitted_at": "t", "completed_at": "t", "run_id": "r1"},
{"job_num": 2, "status": "pending", "batch_id": None,
"records_submitted": 40, "records_completed": None, "records_failed": None,
"submitted_at": None, "completed_at": None, "run_id": "r2"},
]
target, warning = bt._find_next_eligible_job(jobs)
assert target["job_num"] == 2
assert warning is None
def test_find_next_eligible_job_blocked_by_in_progress():
jobs = [
{"job_num": 1, "status": "in_progress", "batch_id": "b1",
"records_submitted": 60, "records_completed": None, "records_failed": None,
"submitted_at": "t", "completed_at": None, "run_id": "r1"},
{"job_num": 2, "status": "pending", "batch_id": None,
"records_submitted": 40, "records_completed": None, "records_failed": None,
"submitted_at": None, "completed_at": None, "run_id": "r2"},
]
target, warning = bt._find_next_eligible_job(jobs)
assert target is None
assert warning is not None
assert "in_progress" in warning
def test_find_next_eligible_job_all_completed():
jobs = [
{"job_num": 1, "status": "completed", "batch_id": "b1",
"records_submitted": 60, "records_completed": 60, "records_failed": 0,
"submitted_at": "t", "completed_at": "t", "run_id": "r1"},
]
target, warning = bt._find_next_eligible_job(jobs)
assert target is None
assert warning is None
def test_resume_from_status_json(tmp_path):
"""Reload a status.json with one completed job and find the next pending job."""
jobs = [
{"job_num": 1, "run_id": "r1", "status": "completed", "batch_id": "b1",
"records_submitted": 60, "records_completed": 58, "records_failed": 2,
"submitted_at": "2026-05-06T10:00:00+00:00", "completed_at": "2026-05-06T11:00:00+00:00"},
{"job_num": 2, "run_id": "r2", "status": "pending", "batch_id": None,
"records_submitted": 40, "records_completed": None, "records_failed": None,
"submitted_at": None, "completed_at": None},
]
bt.save_status(_make_status(jobs), tmp_path)
loaded = bt.load_status(tmp_path)
target, warning = bt._find_next_eligible_job(loaded["jobs"])
assert target["job_num"] == 2
assert warning is None
# ---------------------------------------------------------------------------
# normalize: out-of-order and duplicate custom_id
def test_out_of_order_output_reconciled_by_custom_id():
"""Raw lines processed in any order are mapped to the correct comment."""
c2 = {**COMMENT_ITEM, "comment_id": "99999", "title": "Second comment"}
lookup = {COMMENT_ITEM["comment_id"]: COMMENT_ITEM, "99999": c2}
line_for_99999 = {
**RAW_SUCCESS_LINE,
"custom_id": "comment_99999",
}
line_for_87914 = RAW_SUCCESS_LINE
r1 = bt.normalize_output_line(line_for_99999, lookup, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION)
r2 = bt.normalize_output_line(line_for_87914, lookup, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION)
assert r1["comment_id"] == "99999"
assert r1["input_title"] == "Second comment"
assert r2["comment_id"] == "87914"
assert r2["input_title"] == COMMENT_ITEM["title"]
def test_duplicate_custom_id_both_produce_valid_records():
"""Two raw lines with the same custom_id each produce a valid record."""
r1 = bt.normalize_output_line(RAW_SUCCESS_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION)
r2 = bt.normalize_output_line(RAW_SUCCESS_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION)
assert r1["comment_id"] == r2["comment_id"] == "87914"
assert r1["error"] is None
assert r2["error"] is None

201
tests/tokenizer.py Normal file
View File

@@ -0,0 +1,201 @@
"""Unit tests for analysis/gpt4o/tokenizer.py — no real API calls."""
import io
import json
import math
import sys
from pathlib import Path
from unittest.mock import patch
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
import tokenizer as tk
import analysis_batch as ab
# ---------------------------------------------------------------------------
# Fixtures
FORUM_ITEM = {
"forum_id": "452",
"reg_title": "Model Policies for Transgender Students",
"reg_desc": "Guidance developed in response to HB 145.",
}
COMMENT_A = {
"forum_id": "452",
"comment_id": "100",
"author": "Alice",
"date": "2021-01-04T09:15:00",
"title": "Support",
"text": "I support this policy.",
}
COMMENT_B = {
"forum_id": "452",
"comment_id": "101",
"author": "Bob",
"date": "2021-01-05T10:00:00",
"title": "Oppose",
"text": "I oppose this policy.",
}
COMMENTS = [COMMENT_A, COMMENT_B]
PROMPT_HASH = "abc1234"
INPUT_FILE = "output/f452.jsonl"
INPUT_SHA256 = "deadbeef" * 8
PROMPT_FILE = "analysis/prompt-1.txt"
def _make_report(total_tokens=10_000):
return tk.compute_report(
COMMENTS, FORUM_ITEM, PROMPT_HASH, INPUT_FILE, INPUT_SHA256, PROMPT_FILE
)
# ---------------------------------------------------------------------------
# compute_report: required top-level keys
def test_report_has_top_level_keys():
report = _make_report()
required = {"prompt", "prompt_hash", "input_file", "input_sha256",
"total_comments", "input_tokens"}
assert required.issubset(set(report.keys()))
def test_report_metadata_values():
report = _make_report()
assert report["prompt"] == PROMPT_FILE
assert report["prompt_hash"] == PROMPT_HASH
assert report["input_file"] == INPUT_FILE
assert report["input_sha256"] == INPUT_SHA256
assert report["total_comments"] == 2
def test_report_input_tokens_positive():
report = _make_report()
assert isinstance(report["input_tokens"], int)
assert report["input_tokens"] > 0
# ---------------------------------------------------------------------------
# compute_report: per-model entries
def test_report_has_per_model_keys():
report = _make_report()
for model in ab.MODEL_LIMITS:
assert model in report, f"Model {model} missing from report"
assert isinstance(report[model], dict)
def test_report_per_model_has_required_fields():
report = _make_report()
for model in ab.MODEL_LIMITS:
m = report[model]
assert "jobs" in m
assert "cost_$" in m
assert "est_queue_days" in m
def test_report_jobs_at_least_one():
report = _make_report()
for model in ab.MODEL_LIMITS:
assert report[model]["jobs"] >= 1
# ---------------------------------------------------------------------------
# compute_report: calculation accuracy
def test_cost_calculation():
"""cost_$ = total_tokens / 1M * pricing_rate"""
report = _make_report()
total = report["input_tokens"]
for model in ab.MODEL_LIMITS:
expected_cost = round(total / 1_000_000 * tk.MODEL_PRICING.get(model, 0.0), 4)
assert report[model]["cost_$"] == pytest.approx(expected_cost, abs=1e-6)
def test_est_queue_days_calculation():
"""est_queue_days = total_tokens / tpd (rounded to 2 decimal places)"""
report = _make_report()
total = report["input_tokens"]
for model, tpd in ab.MODEL_LIMITS.items():
expected = round(total / tpd, 2)
assert report[model]["est_queue_days"] == pytest.approx(expected, abs=1e-4)
def test_jobs_ceiling_division():
"""jobs = ceil(total_tokens / (tpd * _LIMIT_BUFFER))"""
report = _make_report()
total = report["input_tokens"]
for model, tpd in ab.MODEL_LIMITS.items():
effective = int(tpd * ab._LIMIT_BUFFER)
expected = math.ceil(total / effective)
assert report[model]["jobs"] == expected
def test_more_comments_increases_tokens():
"""More comments → more input_tokens."""
few = tk.compute_report([COMMENT_A], FORUM_ITEM, PROMPT_HASH, INPUT_FILE, INPUT_SHA256, PROMPT_FILE)
many = tk.compute_report(COMMENTS, FORUM_ITEM, PROMPT_HASH, INPUT_FILE, INPUT_SHA256, PROMPT_FILE)
assert many["input_tokens"] > few["input_tokens"]
# ---------------------------------------------------------------------------
# MODEL_PRICING coverage
def test_model_pricing_has_required_models():
for model in ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini"):
assert model in tk.MODEL_PRICING, f"{model} missing from MODEL_PRICING"
def test_model_pricing_values_positive():
for model, price in tk.MODEL_PRICING.items():
assert price > 0, f"{model} has non-positive price"
# ---------------------------------------------------------------------------
# print_table: runs without error, produces output
def test_print_table_runs():
report = _make_report()
buf = io.StringIO()
with patch("sys.stdout", buf):
tk.print_table(report)
output = buf.getvalue()
assert "gpt-4o" in output
assert "gpt-4o-mini" in output
def test_print_table_shows_all_models():
report = _make_report()
buf = io.StringIO()
with patch("sys.stdout", buf):
tk.print_table(report)
output = buf.getvalue()
for model in ab.MODEL_LIMITS:
assert model in output, f"{model} not shown in print_table output"
def test_print_table_highlights_recommended():
"""When a single-job cheapest model exists, table marks it as recommended."""
report = _make_report()
buf = io.StringIO()
with patch("sys.stdout", buf):
tk.print_table(report)
output = buf.getvalue()
assert "recommended" in output
# ---------------------------------------------------------------------------
# report.json round-trip (write → read)
def test_report_json_roundtrip(tmp_path):
report = _make_report()
out = tmp_path / "report.json"
out.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
loaded = json.loads(out.read_text(encoding="utf-8"))
assert loaded["total_comments"] == report["total_comments"]
assert loaded["input_tokens"] == report["input_tokens"]
assert loaded["gpt-4o-mini"]["jobs"] == report["gpt-4o-mini"]["jobs"]