openai batch refactor

2026-05-06 13:53:50 -04:00
parent 946aeac7c8
commit 64a7a18721
5 changed files with 833 additions and 312 deletions
--- a/analysis/gpt4o/analysis_batch.py
+++ b/analysis/gpt4o/analysis_batch.py
@@ -1,27 +1,23 @@
 #!/usr/bin/env python3
 """
-analysis_batch.py — OpenAI Batch API pipeline
+analysis_batch.py — OpenAI Batch API job runner

-Commands (run manually in order):
-    submit   <input_jsonl> [--model gpt-4o] [--limit N]
-                                           — build request file, upload, create batch
-    status   [run_id]                      — check batch status, update manifest
-    download [run_id]                      — download + normalize output, update manifest
+Run tokenizer.py first to generate report.json, then:
+    create   <report.json> --model <model>   — build job directory
+    submit   [--job N] [--dir DIR]           — submit next eligible job
+    status   [--job N] [--dir DIR]           — check job status
+    download [--job N] [--dir DIR]           — download + normalize completed jobs

-run_id defaults to the most recent run in runs/ when omitted.
-
-File layout (all under analysis/gpt4o/):
-    requests/<run_id>.jsonl     — batch input sent to OpenAI
-    raw/<run_id>.jsonl          — raw batch output from OpenAI
-    runs/<run_id>.json          — run manifest
-    <run_id>_<model>.jsonl      — normalized output (same schema as realtime)
+DIR is a name under analysis/gpt4o/jobs/ (default: most recently created).
 """

 import argparse
 import hashlib
 import json
 import os
+import shutil
 import sys
+import uuid
 from datetime import datetime, timezone
 from pathlib import Path

@@ -35,9 +31,8 @@ except ImportError:
 # ---------------------------------------------------------------------------
 # Model limits and token estimation

-# Max enqueued tokens across ALL concurrent batches for this model
-# (docs/openai.md pricing table, updated 2026-05-05).
-# NOTE: your org tier may be lower — if a submit fails, use --limit to reduce chunk size.
+# Max enqueued tokens across ALL concurrent batches (docs/openai.md, 2026-05-05).
+# Org-tier limits may be lower; use --job to limit submission size if needed.
 MODEL_LIMITS: dict[str, int] = {
    "gpt-5.5":        900_000,
    "gpt-5.4":        900_000,
@@ -48,8 +43,6 @@ MODEL_LIMITS: dict[str, int] = {
    "gpt-o4-mini":  2_000_000,
 }
 _DEFAULT_TOKEN_LIMIT = 900_000
-
-# tiktoken encoding per model family; unknown models fall back to o200k_base
 _MODEL_ENCODING: dict[str, str] = {
    "gpt-5.5":       "o200k_base",
    "gpt-5.4":       "o200k_base",
@@ -59,16 +52,11 @@ _MODEL_ENCODING: dict[str, str] = {
    "gpt-4o-mini":   "o200k_base",
    "gpt-o4-mini":   "o200k_base",
 }
-# Leave 10% headroom below the published limit
 _LIMIT_BUFFER = 0.90


 def estimate_tokens(messages: list[dict], model: str) -> int:
-    """Estimate token count for a messages list.
-
-    Uses tiktoken when available (exact for OpenAI models); falls back to
-    chars/3 + 4-token overhead per message for unknown/Anthropic models.
-    """
+    """Exact token count via tiktoken; falls back to chars/3 + 4 overhead per message."""
    try:
        import tiktoken
        enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base"))
@@ -80,14 +68,11 @@ def estimate_tokens(messages: list[dict], model: str) -> int:
 def chunk_comments_by_tokens(
    comments: list[dict], forum: dict | None, model: str
 ) -> list[list[dict]]:
-    """Split comments into chunks where each chunk fits under the model token limit."""
-    raw_limit = MODEL_LIMITS.get(model, _DEFAULT_TOKEN_LIMIT)
-    token_limit = int(raw_limit * _LIMIT_BUFFER)
-
+    """Greedy bin-pack comments into chunks that fit under the model TPD limit."""
+    token_limit = int(MODEL_LIMITS.get(model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)
    chunks: list[list[dict]] = []
    current: list[dict] = []
    current_tokens = 0
-
    for comment in comments:
        messages, _ = build_messages(comment, forum)
        tokens = estimate_tokens(messages, model)
@@ -98,10 +83,8 @@ def chunk_comments_by_tokens(
        else:
            current.append(comment)
            current_tokens += tokens
-
    if current:
        chunks.append(current)
-
    return chunks


@@ -114,11 +97,11 @@ PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]


 def _load_prompt(path: Path) -> None:
-    """Re-read a prompt file, updating module-level SYSTEM_PROMPT and PROMPT_VERSION."""
    global SYSTEM_PROMPT, PROMPT_VERSION
    SYSTEM_PROMPT = path.read_text(encoding="utf-8").strip()
    PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]

+
 USER_TEMPLATE = """\
 ## Proposed Regulation
 Title: {reg_title}
@@ -141,17 +124,15 @@ MAX_COMMENT_CHARS = 6000
 # ---------------------------------------------------------------------------
 # Directories

-_SCRIPT_DIR  = Path(__file__).parent
-REQUESTS_DIR = _SCRIPT_DIR / "requests"
-RAW_DIR      = _SCRIPT_DIR / "raw"
-RUNS_DIR     = _SCRIPT_DIR / "runs"
+_SCRIPT_DIR = Path(__file__).parent
+JOBS_DIR = _SCRIPT_DIR / "jobs"

 # ---------------------------------------------------------------------------
 # Core functions (importable for tests)


 def load_items(path: Path) -> tuple[dict | None, list[dict]]:
-    """Read a scraped JSONL file. Returns (forum_item_or_None, [comment_items])."""
+    """Read a scraped JSONL. Returns (forum_item_or_None, [comment_items])."""
    forum = None
    comments = []
    with open(path, encoding="utf-8") as f:
@@ -172,7 +153,6 @@ def custom_id_from(comment_id: str) -> str:


 def parse_custom_id(custom_id: str) -> str:
-    """Return comment_id from a custom_id string."""
    return custom_id.removeprefix("comment_")


@@ -180,7 +160,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
    """Build OpenAI messages for one comment. Returns (messages, truncated)."""
    reg_title = (forum or {}).get("reg_title", "[unknown]")
    reg_desc  = (forum or {}).get("reg_desc",  "[unknown]")
-
    body = (comment.get("text") or "").strip()
    truncated = False
    if not body:
@@ -188,7 +167,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
    elif len(body) > MAX_COMMENT_CHARS:
        body = body[:MAX_COMMENT_CHARS] + "... [truncated]"
        truncated = True
-
    user_text = USER_TEMPLATE.format(
        reg_title=reg_title,
        reg_desc=reg_desc,
@@ -196,7 +174,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
        comment_title=comment.get("title", ""),
        comment_text=body,
    )
-
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user",   "content": user_text},
@@ -204,7 +181,6 @@ def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:


 def build_batch_request_line(comment: dict, forum: dict | None, model: str) -> dict:
-    """Build one line of the batch input JSONL."""
    messages, _ = build_messages(comment, forum)
    return {
        "custom_id": custom_id_from(comment["comment_id"]),
@@ -227,14 +203,9 @@ def normalize_output_line(
    model: str,
    prompt_version: str,
 ) -> dict:
-    """Convert one raw batch output line into a normalized analysis record.
-
-    comment_lookup: {comment_id: CommentItem dict}
-    prompt_version: taken from the run manifest so it reflects what was submitted.
-    """
+    """Convert one raw batch output line into a normalized analysis record."""
    comment_id = parse_custom_id(raw_line.get("custom_id", ""))
    comment = comment_lookup.get(comment_id, {})
-
    base = {
        "run_id":         run_id,
        "forum_id":       comment.get("forum_id", ""),
@@ -245,20 +216,16 @@ def normalize_output_line(
        "input_title":    comment.get("title", ""),
        "truncated":      len(comment.get("text") or "") > MAX_COMMENT_CHARS,
    }
-
-    # Check for outer-level batch error (e.g. batch_expired)
    if raw_line.get("error"):
        err = raw_line["error"]
        err_msg = err.get("message", str(err)) if isinstance(err, dict) else str(err)
        return {**base, "stance": None, "stance_confidence": None,
                "stance_rationale": None, "tone": None, "tags": None, "error": err_msg}
-
    response = raw_line.get("response") or {}
    if response.get("status_code") != 200:
        return {**base, "stance": None, "stance_confidence": None,
                "stance_rationale": None, "tone": None, "tags": None,
                "error": f"status {response.get('status_code')}"}
-
    try:
        content = response["body"]["choices"][0]["message"]["content"]
        data = json.loads(content)
@@ -270,278 +237,372 @@ def normalize_output_line(
                "stance_rationale": None, "tone": None, "tags": None, "error": str(exc)}


-def make_manifest(
-    run_id: str,
-    input_filename: str,
-    input_sha256: str,
-    model: str,
-    batch_id: str,
-    records_submitted: int,
-    request_filename: str,
-) -> dict:
-    return {
-        "run_id":                 run_id,
-        "input_filename":         input_filename,
-        "input_sha256":           input_sha256,
-        "prompt_hash":            PROMPT_VERSION,
-        "model":                  model,
-        "batch_id":               batch_id,
-        "records_submitted":      records_submitted,
-        "records_completed":      None,
-        "records_failed":         None,
-        "request_filename":       request_filename,
-        "raw_output_filename":    None,
-        "normalized_output_filename": None,
-        "created_at":             datetime.now(timezone.utc).isoformat(),
-        "completed_at":           None,
+# ---------------------------------------------------------------------------
+# Job directory management
+
+
+def _next_job_dir(stem: str) -> Path:
+    base = stem[:8]
+    i = 1
+    while (JOBS_DIR / f"{base}-{i}").exists():
+        i += 1
+    return JOBS_DIR / f"{base}-{i}"
+
+
+def _latest_job_dir() -> Path:
+    if not JOBS_DIR.exists():
+        sys.exit(f"No jobs directory found. Run 'create' first.")
+    status_files = list(JOBS_DIR.glob("*/status.json"))
+    if not status_files:
+        sys.exit(f"No jobs found in {JOBS_DIR}. Run 'create' first.")
+    return max(status_files, key=lambda p: p.stat().st_mtime).parent
+
+
+def _resolve_job_dir(args) -> Path:
+    if getattr(args, "dir", None):
+        d = Path(args.dir)
+        if not d.is_absolute():
+            d = JOBS_DIR / d
+        if not d.exists():
+            sys.exit(f"Job directory not found: {d}")
+        return d
+    return _latest_job_dir()
+
+
+def load_status(job_dir: Path) -> dict:
+    return json.loads((job_dir / "status.json").read_text(encoding="utf-8"))
+
+
+def save_status(status: dict, job_dir: Path) -> None:
+    (job_dir / "status.json").write_text(
+        json.dumps(status, indent=2, ensure_ascii=False), encoding="utf-8"
+    )
+
+
+def _find_next_eligible_job(jobs: list[dict]) -> tuple[dict | None, str | None]:
+    """Return (next_pending_job, None) or (None, warning_message).
+
+    A job is eligible when it is 'pending' and either it is the first job
+    or its predecessor is 'completed'.
+    """
+    for j in jobs:
+        if j["status"] != "pending":
+            continue
+        if j["job_num"] == 1:
+            return j, None
+        prev = next(p for p in jobs if p["job_num"] == j["job_num"] - 1)
+        if prev["status"] == "completed":
+            return j, None
+        if prev["status"] in ("submitted", "in_progress", "validating", "finalizing"):
+            return None, (
+                f"Job {prev['job_num']} is '{prev['status']}'. "
+                f"Wait for it to complete before submitting job {j['job_num']}."
+            )
+    return None, None
+
+
+# ---------------------------------------------------------------------------
+# Subcommand: create
+
+
+def cmd_create(args) -> None:
+    report_path = Path(args.report)
+    if not report_path.exists():
+        sys.exit(f"Report not found: {report_path}")
+
+    report = json.loads(report_path.read_text(encoding="utf-8"))
+
+    if args.model not in report or not isinstance(report[args.model], dict):
+        available = [k for k in report if isinstance(report.get(k), dict)]
+        sys.exit(f"Model '{args.model}' not in report. Available: {', '.join(available)}")
+
+    prompt_path = Path(report["prompt"])
+    if not prompt_path.exists():
+        sys.exit(f"Prompt file not found: {prompt_path}")
+    _load_prompt(prompt_path)
+
+    input_path = Path(report["input_file"])
+    if not input_path.exists():
+        sys.exit(f"Input file not found: {input_path}")
+    forum, comments = load_items(input_path)
+    if not comments:
+        sys.exit("No comment items found in input file.")
+
+    chunks = chunk_comments_by_tokens(comments, forum, args.model)
+
+    stem = input_path.stem[:8]
+    job_dir = _next_job_dir(stem)
+    JOBS_DIR.mkdir(parents=True, exist_ok=True)
+    job_dir.mkdir()
+
+    shutil.copy2(input_path, job_dir / "forum.jsonl")
+    shutil.copy2(prompt_path, job_dir / "prompt.txt")
+    shutil.copy2(report_path, job_dir / "report.json")
+
+    jobs_meta = []
+    for i, chunk in enumerate(chunks, start=1):
+        req_path = job_dir / f"job{i}-input.jsonl"
+        with open(req_path, "w", encoding="utf-8") as f:
+            for comment in chunk:
+                f.write(json.dumps(build_batch_request_line(comment, forum, args.model),
+                                   ensure_ascii=False) + "\n")
+        jobs_meta.append({
+            "job_num": i,
+            "run_id": str(uuid.uuid4()),
+            "status": "pending",
+            "batch_id": None,
+            "records_submitted": len(chunk),
+            "records_completed": None,
+            "records_failed": None,
+            "submitted_at": None,
+            "completed_at": None,
+        })
+
+    model_info = report[args.model]
+    status = {
+        "model": args.model,
+        "prompt_hash": report["prompt_hash"],
+        "input_file": str(input_path),
+        "input_sha256": report["input_sha256"],
+        "total_comments": report["total_comments"],
+        "input_tokens": report["input_tokens"],
+        "est_queue_days": model_info["est_queue_days"],
+        "cost_$": model_info["cost_$"],
+        "total_jobs": len(chunks),
+        "jobs": jobs_meta,
    }
+    save_status(status, job_dir)

-
-def _latest_run_id() -> str:
-    """Return the run_id of the most recently saved manifest, or exit if none found."""
-    runs = list(RUNS_DIR.glob("*.json")) if RUNS_DIR.exists() else []
-    if not runs:
-        sys.exit(f"No runs found in {RUNS_DIR}. Submit a batch first.")
-    latest = max(runs, key=lambda p: p.stat().st_mtime)
-    return latest.stem
-
-
-def load_manifest(run_id: str) -> dict:
-    path = RUNS_DIR / f"{run_id}.json"
-    return json.loads(path.read_text(encoding="utf-8"))
-
-
-def save_manifest(manifest: dict) -> None:
-    RUNS_DIR.mkdir(parents=True, exist_ok=True)
-    path = RUNS_DIR / f"{manifest['run_id']}.json"
-    path.write_text(json.dumps(manifest, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(f"Created: {job_dir.name}")
+    print(f"  {len(chunks)} job(s)  |  {len(comments)} comments  |  model: {args.model}")
+    print(f"\nNext:  python analysis/gpt4o/analysis_batch.py submit")


 # ---------------------------------------------------------------------------
 # Subcommand: submit

-def _submit_chunk(
-    chunk: list[dict],
-    forum: dict | None,
-    input_path: Path,
-    input_sha256: str,
-    model: str,
-    client,
-    chunk_index: int,
-    total_chunks: int,
-) -> str:
-    """Upload and submit one chunk of comments. Returns the run_id."""
-    import uuid
-    run_id = str(uuid.uuid4())
-    label = f"chunk {chunk_index + 1}/{total_chunks}" if total_chunks > 1 else "single batch"

-    REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
-    request_path = REQUESTS_DIR / f"{run_id}.jsonl"
-    with open(request_path, "w", encoding="utf-8") as f:
-        for comment in chunk:
-            line = build_batch_request_line(comment, forum, model)
-            f.write(json.dumps(line, ensure_ascii=False) + "\n")
+def cmd_submit(args, client) -> None:
+    job_dir = _resolve_job_dir(args)
+    status = load_status(job_dir)
+    jobs = status["jobs"]

-    print(f"[{label}] Wrote {len(chunk)} requests → {request_path}", file=sys.stderr)
+    if args.job:
+        target = next((j for j in jobs if j["job_num"] == args.job), None)
+        if target is None:
+            sys.exit(f"Job {args.job} not found in {job_dir.name}.")
+        if target["status"] != "pending":
+            sys.exit(f"Job {args.job} is already '{target['status']}' — cannot resubmit.")
+        if target["job_num"] > 1:
+            prev = next(p for p in jobs if p["job_num"] == target["job_num"] - 1)
+            if prev["status"] != "completed":
+                sys.exit(
+                    f"Cannot submit job {target['job_num']}: "
+                    f"job {prev['job_num']} is '{prev['status']}' (must be 'completed')."
+                )
+    else:
+        target, warning = _find_next_eligible_job(jobs)
+        if warning:
+            print(warning, file=sys.stderr)
+            sys.exit(1)
+        if target is None:
+            all_done = all(j["status"] == "completed" for j in jobs)
+            print("All jobs completed." if all_done else "No pending jobs eligible for submission.")
+            return

-    with open(request_path, "rb") as f:
+    n = target["job_num"]
+    req_path = job_dir / f"job{n}-input.jsonl"
+    print(f"Submitting job {n}/{status['total_jobs']} ({target['records_submitted']} comments) ...",
+          file=sys.stderr)
+
+    with open(req_path, "rb") as f:
        uploaded = client.files.create(file=f, purpose="batch")
-    print(f"[{label}] Uploaded: {uploaded.id}", file=sys.stderr)

    batch = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
-        metadata={"run_id": run_id, "input_filename": str(input_path)},
-    )
-    print(f"[{label}] Batch created: {batch.id}  status={batch.status}", file=sys.stderr)
-
-    manifest = make_manifest(
-        run_id=run_id,
-        input_filename=str(input_path),
-        input_sha256=input_sha256,
-        model=model,
-        batch_id=batch.id,
-        records_submitted=len(chunk),
-        request_filename=str(request_path),
-    )
-    save_manifest(manifest)
-    return run_id
-
-
-def cmd_submit(args, client) -> None:
-    _load_prompt(Path(args.prompt))
-    print(f"Prompt: {args.prompt}  (version {PROMPT_VERSION})", file=sys.stderr)
-
-    input_path = Path(args.input)
-    if not input_path.exists():
-        sys.exit(f"File not found: {input_path}")
-
-    print(f"Reading {input_path} ...", file=sys.stderr)
-    forum, comments = load_items(input_path)
-    if not comments:
-        sys.exit("No comment items found in input file.")
-    if forum is None:
-        print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)
-
-    if args.limit:
-        comments = comments[:args.limit]
-        print(f"Limiting to {len(comments)} comments (--limit {args.limit}).", file=sys.stderr)
-
-    token_limit = int(MODEL_LIMITS.get(args.model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)
-    chunks = chunk_comments_by_tokens(comments, forum, args.model)
-    total = len(chunks)
-    print(
-        f"Model: {args.model}  token limit: {token_limit:,}  "
-        f"→ {len(comments)} comments split into {total} chunk(s).",
-        file=sys.stderr,
+        metadata={"run_id": target["run_id"], "job_dir": job_dir.name},
    )

-    input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()
+    target["status"] = "submitted"
+    target["batch_id"] = batch.id
+    target["submitted_at"] = datetime.now(timezone.utc).isoformat()
+    save_status(status, job_dir)

-    # Submit only the first chunk — the enqueued token limit is a TOTAL across all
-    # concurrent batches, so stacking multiple submissions will exceed the quota.
-    # Wait for each batch to complete before submitting the next.
-    run_id = _submit_chunk(chunks[0], forum, input_path, input_sha256, args.model, client, 0, total)
-
-    print(f"\nBatch 1/{total} submitted.", file=sys.stderr)
-    print(f"  status:   python analysis/gpt4o/analysis_batch.py status {run_id}", file=sys.stderr)
-    print(f"  download: python analysis/gpt4o/analysis_batch.py download {run_id}", file=sys.stderr)
-
-    if total > 1:
-        remaining = sum(len(c) for c in chunks[1:])
-        print(f"\n{total - 1} more chunk(s) remaining ({remaining} comments).", file=sys.stderr)
-        print("After this batch completes and is downloaded, rerun submit with --limit to get the next chunk:", file=sys.stderr)
-        offset = len(chunks[0])
-        for idx, chunk in enumerate(chunks[1:], start=2):
-            print(f"  chunk {idx}/{total}: comments {offset}–{offset + len(chunk) - 1}", file=sys.stderr)
-            offset += len(chunk)
-
-    print(run_id)  # stdout for scripting
+    print(f"Job {n} submitted: {batch.id}  ({batch.status})")
+    print(f"  python analysis/gpt4o/analysis_batch.py status")


 # ---------------------------------------------------------------------------
 # Subcommand: status

+
 def cmd_status(args, client) -> None:
-    run_id = args.run_id or _latest_run_id()
-    if not args.run_id:
-        print(f"(using latest run: {run_id})", file=sys.stderr)
-    manifest = load_manifest(run_id)
-    batch = client.batches.retrieve(manifest["batch_id"])
+    job_dir = _resolve_job_dir(args)
+    status = load_status(job_dir)
+    jobs = status["jobs"]

-    counts = batch.request_counts
-    print(f"status:     {batch.status}")
-    print(f"completed:  {counts.completed}/{counts.total}")
-    print(f"failed:     {counts.failed}")
+    job_filter = getattr(args, "job", None)

-    manifest["records_completed"] = counts.completed
-    manifest["records_failed"]    = counts.failed
-    save_manifest(manifest)
+    for job in jobs:
+        if job_filter is not None and job["job_num"] != job_filter:
+            continue
+        if not job["batch_id"]:
+            continue
+        if job["status"] in ("completed", "failed", "expired", "cancelled", "pending"):
+            continue
+        batch = client.batches.retrieve(job["batch_id"])
+        counts = batch.request_counts
+        if batch.status == "completed":
+            job["status"] = "completed"
+            if batch.completed_at:
+                job["completed_at"] = datetime.fromtimestamp(
+                    batch.completed_at, tz=timezone.utc
+                ).isoformat()
+        elif batch.status in ("failed", "expired", "cancelled"):
+            job["status"] = batch.status
+        else:
+            job["status"] = batch.status
+        job["records_completed"] = counts.completed
+        job["records_failed"] = counts.failed

-    if batch.status == "completed":
-        print(f"\nReady to download. Run:")
-        print(f"  python analysis/gpt4o/analysis_batch.py download {run_id}")
+    save_status(status, job_dir)
+
+    target_jobs = jobs if not job_filter else [j for j in jobs if j["job_num"] == job_filter]
+    print(f"Dir: {job_dir.name}  |  Model: {status['model']}  |  {status['total_jobs']} job(s)")
+    print(f"{'Job':<5} {'Status':<14} {'Records':>12}  {'Submitted':<20}  {'Completed':<20}")
+    print("-" * 76)
+    for j in target_jobs:
+        rec = (f"{j['records_completed']}/{j['records_submitted']}"
+               if j["records_completed"] is not None else f"-/{j['records_submitted']}")
+        sub  = (j["submitted_at"]  or "-")[:19]
+        done = (j["completed_at"] or "-")[:19]
+        print(f"{j['job_num']:<5} {j['status']:<14} {rec:>12}  {sub:<20}  {done:<20}")


 # ---------------------------------------------------------------------------
 # Subcommand: download

+
 def cmd_download(args, client) -> None:
-    run_id = args.run_id or _latest_run_id()
-    if not args.run_id:
-        print(f"(using latest run: {run_id})", file=sys.stderr)
-    manifest = load_manifest(run_id)
-    batch = client.batches.retrieve(manifest["batch_id"])
+    job_dir = _resolve_job_dir(args)

-    if batch.status != "completed":
-        sys.exit(f"Batch not complete yet (status={batch.status}). Run 'status' to check.")
+    # Refresh status before deciding what to download
+    cmd_status(args, client)
+    status = load_status(job_dir)
+    jobs = status["jobs"]

-    run_id    = manifest["run_id"]
-    model     = manifest["model"]
-    model_slug = model.replace("/", "-")
+    job_filter = getattr(args, "job", None)
+    if job_filter:
+        candidates = [j for j in jobs if j["job_num"] == job_filter]
+    else:
+        candidates = [
+            j for j in jobs
+            if j["status"] == "completed"
+            and not (job_dir / f"job{j['job_num']}-output.jsonl").exists()
+        ]

-    # Download raw output
-    RAW_DIR.mkdir(parents=True, exist_ok=True)
-    raw_path = RAW_DIR / f"{run_id}.jsonl"
-    raw_text = client.files.content(batch.output_file_id).text
-    raw_path.write_text(raw_text, encoding="utf-8")
-    print(f"Raw output → {raw_path}", file=sys.stderr)
+    if not candidates:
+        print("No completed jobs pending download.", file=sys.stderr)
+        return

-    # Build comment lookup from original input for reconciliation
-    input_path = Path(manifest["input_filename"])
-    _, comments = load_items(input_path)
-    comment_lookup = {c["comment_id"]: c for c in comments}
+    _, all_comments = load_items(job_dir / "forum.jsonl")
+    comment_lookup = {c["comment_id"]: c for c in all_comments}

-    # Normalize
-    completed_at = datetime.now(timezone.utc).isoformat()
-    if batch.completed_at:
-        completed_at = datetime.fromtimestamp(batch.completed_at, tz=timezone.utc).isoformat()
+    for job in candidates:
+        n = job["job_num"]

-    normalized_path = _SCRIPT_DIR / f"{run_id}_{model_slug}.jsonl"
-    n_ok = n_err = 0
-    with open(normalized_path, "w", encoding="utf-8") as out:
-        for line in raw_text.splitlines():
-            if not line.strip():
-                continue
-            raw_line = json.loads(line)
-            record = normalize_output_line(raw_line, comment_lookup, run_id, completed_at, model, manifest["prompt_hash"])
-            out.write(json.dumps(record, ensure_ascii=False) + "\n")
-            if record["error"]:
-                n_err += 1
-            else:
-                n_ok += 1
+        if job["status"] != "completed":
+            print(f"Job {n} not yet completed ('{job['status']}'), skipping.", file=sys.stderr)
+            continue

-    print(f"Normalized → {normalized_path}  ({n_ok} ok, {n_err} errors)", file=sys.stderr)
+        batch = client.batches.retrieve(job["batch_id"])

-    manifest["records_completed"]         = n_ok
-    manifest["records_failed"]            = n_err
-    manifest["raw_output_filename"]       = str(raw_path)
-    manifest["normalized_output_filename"] = str(normalized_path)
-    manifest["completed_at"]              = completed_at
-    save_manifest(manifest)
-    print(f"Manifest updated → {RUNS_DIR / run_id}.json", file=sys.stderr)
+        if not batch.output_file_id:
+            print(f"Job {n}: no output file available from OpenAI.", file=sys.stderr)
+            continue
+
+        raw_text = client.files.content(batch.output_file_id).text
+        raw_path = job_dir / f"job{n}-output-raw.jsonl"
+        raw_path.write_text(raw_text, encoding="utf-8")
+        print(f"Job {n} raw → {raw_path.name}", file=sys.stderr)
+
+        if batch.error_file_id:
+            err_text = client.files.content(batch.error_file_id).text
+            err_path = job_dir / f"job{n}-errors.jsonl"
+            err_path.write_text(err_text, encoding="utf-8")
+            n_err_lines = sum(1 for line in err_text.splitlines() if line.strip())
+            print(f"Job {n} errors → {err_path.name}  ({n_err_lines} lines)", file=sys.stderr)
+
+        completed_at = job.get("completed_at") or datetime.now(timezone.utc).isoformat()
+        norm_path = job_dir / f"job{n}-output.jsonl"
+        n_ok = n_err = 0
+        with open(norm_path, "w", encoding="utf-8") as out:
+            for line in raw_text.splitlines():
+                if not line.strip():
+                    continue
+                record = normalize_output_line(
+                    json.loads(line), comment_lookup,
+                    job["run_id"], completed_at,
+                    status["model"], status["prompt_hash"],
+                )
+                out.write(json.dumps(record, ensure_ascii=False) + "\n")
+                if record["error"]:
+                    n_err += 1
+                else:
+                    n_ok += 1
+
+        print(f"Job {n} normalized → {norm_path.name}  ({n_ok} ok, {n_err} errors)", file=sys.stderr)
+        job["records_completed"] = n_ok
+        job["records_failed"] = n_err
+
+    save_status(status, job_dir)


 # ---------------------------------------------------------------------------
 # CLI

+
+def _add_common_args(p: argparse.ArgumentParser) -> None:
+    p.add_argument("--job", type=int, default=None, metavar="N",
+                   help="Job number within the run (default: auto)")
+    p.add_argument("--dir", default=None, metavar="DIR",
+                   help="Job directory name or path (default: most recent)")
+
+
 def main() -> None:
    load_dotenv()

-    api_key = os.environ.get("OPENAI_API_KEY")
-    if not api_key:
-        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
-
    parser = argparse.ArgumentParser(
-        description="Public comment batch analysis pipeline.",
+        description="Batch analysis job runner.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    sub = parser.add_subparsers(dest="command", required=True)

-    p_submit = sub.add_parser("submit", help="Build and submit a batch job")
-    p_submit.add_argument("input", help="Path to scraped JSONL file")
-    p_submit.add_argument("--model", default="gpt-4o", help="OpenAI model (default: gpt-4o)")
-    p_submit.add_argument(
-        "--prompt",
-        default=str(_DEFAULT_PROMPT_FILE),
-        help="Path to system prompt file (default: analysis/prompt-1.txt)",
-    )
-    p_submit.add_argument(
-        "--limit", type=int, default=None, metavar="N",
-        help="Submit only the first N comments (useful for staying under token quota)",
-    )
+    p_create = sub.add_parser("create", help="Create job directory from tokenizer report")
+    p_create.add_argument("report", help="Path to report.json from tokenizer.py")
+    p_create.add_argument("--model", required=True, help="Model (e.g. gpt-4o-mini)")

-    p_status = sub.add_parser("status", help="Check batch status")
-    p_status.add_argument("run_id", nargs="?", default=None,
-                          help="run_id from submit (default: most recent run)")
+    p_submit = sub.add_parser("submit", help="Submit next eligible job")
+    _add_common_args(p_submit)

-    p_download = sub.add_parser("download", help="Download and normalize completed batch")
-    p_download.add_argument("run_id", nargs="?", default=None,
-                            help="run_id from submit (default: most recent run)")
+    p_status = sub.add_parser("status", help="Check job status")
+    _add_common_args(p_status)
+
+    p_download = sub.add_parser("download", help="Download and normalize completed jobs")
+    _add_common_args(p_download)

    args = parser.parse_args()
+
+    if args.command == "create":
+        cmd_create(args)
+        return
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
    client = openai.OpenAI(api_key=api_key)

    if args.command == "submit":
--- a/analysis/gpt4o/tokenizer.py
+++ b/analysis/gpt4o/tokenizer.py
@@ -0,0 +1,141 @@
+#!/usr/bin/env python3
+"""
+tokenizer.py — estimate token usage and cost for a batch analysis run.
+
+Usage:
+    python analysis/gpt4o/tokenizer.py output/f452.jsonl [--prompt analysis/prompt-1.txt]
+
+Prints a per-model comparison table and writes report.json next to the input file.
+Run this before analysis_batch.py create.
+"""
+
+import argparse
+import hashlib
+import json
+import math
+import sys
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent))
+import analysis_batch as _ab
+
+# Input pricing ($/1M tokens, batch API) — from docs/openai.md, updated 2026-05-05.
+# Add Anthropic/other models here when needed; only models with a LIMITS entry are reported.
+MODEL_PRICING: dict[str, float] = {
+    "gpt-5.5":       2.50,
+    "gpt-5.4":       1.25,
+    "gpt-5.4-mini":  0.375,
+    "gpt-5.4-nano":  0.10,
+    "gpt-4o":        1.25,
+    "gpt-4o-mini":   0.075,
+    "gpt-o4-mini":   0.55,
+}
+
+
+def compute_report(
+    comments: list[dict],
+    forum: dict | None,
+    prompt_hash: str,
+    input_file: str,
+    input_sha256: str,
+    prompt_file: str,
+) -> dict:
+    """Compute token estimate and per-model job/cost/time breakdown."""
+    # Use gpt-4o encoding as the canonical estimator (same for all current models)
+    total_tokens = sum(
+        _ab.estimate_tokens(_ab.build_messages(c, forum)[0], "gpt-4o")
+        for c in comments
+    )
+
+    report: dict = {
+        "prompt": prompt_file,
+        "prompt_hash": prompt_hash,
+        "input_file": input_file,
+        "input_sha256": input_sha256,
+        "total_comments": len(comments),
+        "input_tokens": total_tokens,
+    }
+
+    for model, tpd in _ab.MODEL_LIMITS.items():
+        effective_tpd = int(tpd * _ab._LIMIT_BUFFER)
+        jobs = math.ceil(total_tokens / effective_tpd)
+        cost = round(total_tokens / 1_000_000 * MODEL_PRICING.get(model, 0.0), 4)
+        est_days = round(total_tokens / tpd, 2)
+        report[model] = {"jobs": jobs, "cost_$": cost, "est_queue_days": est_days}
+
+    return report
+
+
+def print_table(report: dict) -> None:
+    """Print a human-readable model comparison table to stdout."""
+    print(f"\nInput:    {report['input_file']}")
+    print(f"Comments: {report['total_comments']:,}")
+    print(f"Tokens:   {report['input_tokens']:,}")
+    print(f"Prompt:   {report['prompt']}  (hash: {report['prompt_hash']})")
+    print()
+
+    # Cheapest model that fits in one job
+    single_job_models = [m for m in _ab.MODEL_LIMITS if report.get(m, {}).get("jobs") == 1]
+    best = (min(single_job_models, key=lambda m: report[m]["cost_$"])
+            if single_job_models else None)
+
+    print(f"{'Model':<15} {'Jobs':>5}  {'Cost ($)':>9}  {'Est days':>9}  {'Note'}")
+    print("-" * 62)
+    for model in _ab.MODEL_LIMITS:
+        if model not in report or not isinstance(report[model], dict):
+            continue
+        m = report[model]
+        note = "<-- recommended" if model == best else ""
+        print(f"{model:<15} {m['jobs']:>5}  {m['cost_$']:>9.4f}  {m['est_queue_days']:>9.2f}  {note}")
+    print()
+
+
+def main() -> None:
+    _default_prompt = Path(__file__).parent.parent / "prompt-1.txt"
+
+    parser = argparse.ArgumentParser(description="Estimate batch token usage and cost.")
+    parser.add_argument("input", help="Scraped JSONL file")
+    parser.add_argument(
+        "--prompt",
+        default=str(_default_prompt),
+        help=f"System prompt file (default: {_default_prompt.name})",
+    )
+    args = parser.parse_args()
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        sys.exit(f"File not found: {input_path}")
+
+    prompt_path = Path(args.prompt)
+    if not prompt_path.exists():
+        sys.exit(f"Prompt file not found: {prompt_path}")
+
+    prompt_text = prompt_path.read_text(encoding="utf-8").strip()
+    prompt_hash = hashlib.sha256(prompt_text.encode("utf-8")).hexdigest()[:7]
+
+    # Ensure build_messages uses the specified prompt
+    _ab._load_prompt(prompt_path)
+
+    forum, comments = _ab.load_items(input_path)
+    if not comments:
+        sys.exit("No comment items found.")
+    if forum is None:
+        print("Warning: no ForumItem — token estimates may be slightly low.", file=sys.stderr)
+
+    input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()
+
+    report = compute_report(
+        comments, forum, prompt_hash,
+        str(input_path), input_sha256, str(prompt_path),
+    )
+
+    print_table(report)
+
+    out_path = input_path.parent / "report.json"
+    out_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
+    print(f"Report written to: {out_path}")
+    print(f"\nNext:  python analysis/gpt4o/analysis_batch.py create {out_path} --model <model>")
+
+
+if __name__ == "__main__":
+    main()