cleanup

refactor/batch-openai prep
2026-05-06 13:30:53 -04:00 · 2026-05-06 13:29:59 -04:00
10 changed files with 468 additions and 9165 deletions
--- a/analysis/gpt4o/analysis_batch.py
+++ b/analysis/gpt4o/analysis_batch.py
@@ -1,11 +1,14 @@
 #!/usr/bin/env python3
 """
-analysis/gpt4o/analysis-batch.py — OpenAI Batch API pipeline
+analysis_batch.py — OpenAI Batch API pipeline

 Commands (run manually in order):
-    submit   <input_jsonl> [--model gpt-4o]   — build request file, upload, create batch
-    status   <run_id>                          — check batch status, update manifest
-    download <run_id>                          — download + normalize output, update manifest
+    submit   <input_jsonl> [--model gpt-4o] [--limit N]
+                                           — build request file, upload, create batch
+    status   [run_id]                      — check batch status, update manifest
+    download [run_id]                      — download + normalize output, update manifest
+
+run_id defaults to the most recently modified manifest in runs/ when omitted.

 File layout (all under analysis/gpt4o/):
    requests/<run_id>.jsonl     — batch input sent to OpenAI
@@ -29,6 +32,79 @@ try:
 except ImportError:
    sys.exit("openai package not installed. Run: pip install openai")

+# ---------------------------------------------------------------------------
+# Model limits and token estimation
+
+# Max enqueued tokens across ALL concurrent batches for this model
+# (docs/openai.md pricing table, updated 2026-05-05).
+# NOTE: your org tier may be lower — if a submit fails, use --limit to reduce chunk size.
+MODEL_LIMITS: dict[str, int] = {
+    "gpt-5.5":        900_000,
+    "gpt-5.4":        900_000,
+    "gpt-5.4-mini": 2_000_000,
+    "gpt-5.4-nano":   200_000,
+    "gpt-4o":         900_000,
+    "gpt-4o-mini":  2_000_000,
+    "gpt-o4-mini":  2_000_000,  # NOTE(review): OpenAI's published id is "o4-mini" — confirm this key
+}
+_DEFAULT_TOKEN_LIMIT = 900_000  # applied to any model name that is not an exact key above
+
+# tiktoken encoding per model family; unknown models fall back to o200k_base
+_MODEL_ENCODING: dict[str, str] = {
+    "gpt-5.5":       "o200k_base",
+    "gpt-5.4":       "o200k_base",
+    "gpt-5.4-mini":  "o200k_base",
+    "gpt-5.4-nano":  "o200k_base",
+    "gpt-4o":        "o200k_base",
+    "gpt-4o-mini":   "o200k_base",
+    "gpt-o4-mini":   "o200k_base",  # NOTE(review): same key question as in MODEL_LIMITS
+}
+# Leave 10% headroom below the published limit
+_LIMIT_BUFFER = 0.90
+
+
+def estimate_tokens(messages: list[dict], model: str) -> int:
+    """Estimate token count for a messages list.
+
+    Uses tiktoken when installed (models missing from _MODEL_ENCODING use
+    o200k_base); otherwise chars // 3. Both paths add 4 tokens per message.
+    """
+    try:
+        import tiktoken
+        enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base"))
+        return sum(4 + len(enc.encode(m["content"])) for m in messages)
+    except ImportError:  # only a missing tiktoken triggers the chars // 3 fallback
+        return sum(4 + len(m["content"]) // 3 for m in messages)
+
+
+def chunk_comments_by_tokens(
+    comments: list[dict], forum: dict | None, model: str
+) -> list[list[dict]]:
+    """Split comments, in order, into chunks that stay within the buffered token limit."""
+    raw_limit = MODEL_LIMITS.get(model, _DEFAULT_TOKEN_LIMIT)
+    token_limit = int(raw_limit * _LIMIT_BUFFER)
+
+    chunks: list[list[dict]] = []
+    current: list[dict] = []
+    current_tokens = 0
+
+    for comment in comments:
+        messages, _ = build_messages(comment, forum)
+        tokens = estimate_tokens(messages, model)
+        if current and current_tokens + tokens > token_limit:  # an oversized lone comment still gets its own chunk
+            chunks.append(current)
+            current = [comment]
+            current_tokens = tokens
+        else:
+            current.append(comment)
+            current_tokens += tokens
+
+    if current:  # flush the trailing partial chunk; empty input yields []
+        chunks.append(current)
+
+    return chunks
+
+
 # ---------------------------------------------------------------------------
 # Prompt

@@ -221,6 +297,15 @@ def make_manifest(
    }


+def _latest_run_id() -> str:
+    """Return the run_id of the most recently saved manifest, or exit if none found."""
+    runs = list(RUNS_DIR.glob("*.json")) if RUNS_DIR.exists() else []
+    if not runs:
+        sys.exit(f"No runs found in {RUNS_DIR}. Submit a batch first.")
+    latest = max(runs, key=lambda p: p.stat().st_mtime)  # newest by file mtime, not by created_at
+    return latest.stem  # manifest files are named <run_id>.json
+
+
 def load_manifest(run_id: str) -> dict:
    path = RUNS_DIR / f"{run_id}.json"
    return json.loads(path.read_text(encoding="utf-8"))
@@ -235,6 +320,55 @@ def save_manifest(manifest: dict) -> None:
 # ---------------------------------------------------------------------------
 # Subcommand: submit

+def _submit_chunk(
+    chunk: list[dict],
+    forum: dict | None,
+    input_path: Path,
+    input_sha256: str,
+    model: str,
+    client,
+    chunk_index: int,
+    total_chunks: int,
+) -> str:
+    """Write, upload and submit one chunk (chunk_index is 0-based); returns its run_id."""
+    import uuid
+    run_id = str(uuid.uuid4())  # fresh id per call; also names the request file below
+    label = f"chunk {chunk_index + 1}/{total_chunks}" if total_chunks > 1 else "single batch"
+
+    REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
+    request_path = REQUESTS_DIR / f"{run_id}.jsonl"
+    with open(request_path, "w", encoding="utf-8") as f:
+        for comment in chunk:
+            line = build_batch_request_line(comment, forum, model)
+            f.write(json.dumps(line, ensure_ascii=False) + "\n")
+
+    print(f"[{label}] Wrote {len(chunk)} requests → {request_path}", file=sys.stderr)
+
+    with open(request_path, "rb") as f:
+        uploaded = client.files.create(file=f, purpose="batch")
+    print(f"[{label}] Uploaded: {uploaded.id}", file=sys.stderr)
+
+    batch = client.batches.create(
+        input_file_id=uploaded.id,
+        endpoint="/v1/chat/completions",
+        completion_window="24h",
+        metadata={"run_id": run_id, "input_filename": str(input_path)},
+    )
+    print(f"[{label}] Batch created: {batch.id}  status={batch.status}", file=sys.stderr)
+
+    manifest = make_manifest(
+        run_id=run_id,
+        input_filename=str(input_path),
+        input_sha256=input_sha256,
+        model=model,
+        batch_id=batch.id,
+        records_submitted=len(chunk),
+        request_filename=str(request_path),
+    )
+    save_manifest(manifest)  # saved last: if upload or batch creation raises, no manifest is written
+    return run_id
+
+
 def cmd_submit(args, client) -> None:
    _load_prompt(Path(args.prompt))
    print(f"Prompt: {args.prompt}  (version {PROMPT_VERSION})", file=sys.stderr)
@@ -250,49 +384,39 @@ def cmd_submit(args, client) -> None:
    if forum is None:
        print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)

-    import uuid
-    run_id = str(uuid.uuid4())
+    if args.limit:
+        comments = comments[:args.limit]
+        print(f"Limiting to {len(comments)} comments (--limit {args.limit}).", file=sys.stderr)
+
+    token_limit = int(MODEL_LIMITS.get(args.model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)
+    chunks = chunk_comments_by_tokens(comments, forum, args.model)
+    total = len(chunks)
+    print(
+        f"Model: {args.model}  token limit: {token_limit:,}  "
+        f"→ {len(comments)} comments split into {total} chunk(s).",
+        file=sys.stderr,
+    )
+
    input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()

-    # Build batch request file
-    REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
-    request_path = REQUESTS_DIR / f"{run_id}.jsonl"
-    with open(request_path, "w", encoding="utf-8") as f:
-        for comment in comments:
-            line = build_batch_request_line(comment, forum, args.model)
-            f.write(json.dumps(line, ensure_ascii=False) + "\n")
+    # Submit only the first chunk — the enqueued token limit is a TOTAL across all
+    # concurrent batches, so stacking multiple submissions will exceed the quota.
+    # Wait for each batch to complete before submitting the next.
+    run_id = _submit_chunk(chunks[0], forum, input_path, input_sha256, args.model, client, 0, total)

-    print(f"Wrote {len(comments)} requests → {request_path}", file=sys.stderr)
+    print(f"\nBatch 1/{total} submitted.", file=sys.stderr)
+    print(f"  status:   python analysis/gpt4o/analysis_batch.py status {run_id}", file=sys.stderr)
+    print(f"  download: python analysis/gpt4o/analysis_batch.py download {run_id}", file=sys.stderr)

-    # Upload to OpenAI
-    print("Uploading request file ...", file=sys.stderr)
-    with open(request_path, "rb") as f:
-        uploaded = client.files.create(file=f, purpose="batch")
-    print(f"Uploaded: {uploaded.id}", file=sys.stderr)
+    if total > 1:
+        remaining = sum(len(c) for c in chunks[1:])
+        print(f"\n{total - 1} more chunk(s) remaining ({remaining} comments).", file=sys.stderr)
+        print("After this batch completes and is downloaded, rerun submit with --limit to get the next chunk:", file=sys.stderr)
+        offset = len(chunks[0])
+        for idx, chunk in enumerate(chunks[1:], start=2):
+            print(f"  chunk {idx}/{total}: comments {offset}–{offset + len(chunk) - 1}", file=sys.stderr)
+            offset += len(chunk)

-    # Create batch
-    batch = client.batches.create(
-        input_file_id=uploaded.id,
-        endpoint="/v1/chat/completions",
-        completion_window="24h",
-        metadata={"run_id": run_id, "input_filename": str(input_path)},
-    )
-    print(f"Batch created: {batch.id}  status={batch.status}", file=sys.stderr)
-
-    # Save manifest
-    manifest = make_manifest(
-        run_id=run_id,
-        input_filename=str(input_path),
-        input_sha256=input_sha256,
-        model=args.model,
-        batch_id=batch.id,
-        records_submitted=len(comments),
-        request_filename=str(request_path),
-    )
-    save_manifest(manifest)
-
-    print(f"\nrun_id: {run_id}", file=sys.stderr)
-    print(f"Check status:   python analysis/gpt4o/analysis-batch.py status {run_id}", file=sys.stderr)
    print(run_id)  # stdout for scripting


@@ -300,7 +424,10 @@ def cmd_submit(args, client) -> None:
 # Subcommand: status

 def cmd_status(args, client) -> None:
-    manifest = load_manifest(args.run_id)
+    run_id = args.run_id or _latest_run_id()
+    if not args.run_id:
+        print(f"(using latest run: {run_id})", file=sys.stderr)
+    manifest = load_manifest(run_id)
    batch = client.batches.retrieve(manifest["batch_id"])

    counts = batch.request_counts
@@ -314,14 +441,17 @@ def cmd_status(args, client) -> None:

    if batch.status == "completed":
        print(f"\nReady to download. Run:")
-        print(f"  python analysis/gpt4o/analysis-batch.py download {args.run_id}")
+        print(f"  python analysis/gpt4o/analysis_batch.py download {run_id}")


 # ---------------------------------------------------------------------------
 # Subcommand: download

 def cmd_download(args, client) -> None:
-    manifest = load_manifest(args.run_id)
+    run_id = args.run_id or _latest_run_id()
+    if not args.run_id:
+        print(f"(using latest run: {run_id})", file=sys.stderr)
+    manifest = load_manifest(run_id)
    batch = client.batches.retrieve(manifest["batch_id"])

    if batch.status != "completed":
@@ -398,12 +528,18 @@ def main() -> None:
        default=str(_DEFAULT_PROMPT_FILE),
        help="Path to system prompt file (default: analysis/prompt-1.txt)",
    )
+    p_submit.add_argument(
+        "--limit", type=int, default=None, metavar="N",
+        help="Submit only the first N comments (useful for staying under token quota)",
+    )

    p_status = sub.add_parser("status", help="Check batch status")
-    p_status.add_argument("run_id", help="run_id from submit output")
+    p_status.add_argument("run_id", nargs="?", default=None,
+                          help="run_id from submit (default: most recent run)")

    p_download = sub.add_parser("download", help="Download and normalize completed batch")
-    p_download.add_argument("run_id", help="run_id from submit output")
+    p_download.add_argument("run_id", nargs="?", default=None,
+                            help="run_id from submit (default: most recent run)")

    args = parser.parse_args()
    client = openai.OpenAI(api_key=api_key)
--- a/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T18-48-32+00-00.jsonl
+++ b/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T18-48-32+00-00.jsonl
@@ -1,5 +0,0 @@
-{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87914", "analyzed_at": "2026-05-05T18:48:32.792363+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Support the Model Policy Wholeheartedly", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I support the model policy wholeheartedly' and praises the policy for creating inclusive and welcoming schools for transgender and non-binary students. They also express gratitude towards the Virginia Department of Education for developing the policy.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "privacy", "bullying prevention"], "truncated": false, "error": null}
-{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87915", "analyzed_at": "2026-05-05T18:48:37.398155+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this vital policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, 'I strongly support these proposals,' indicating clear approval of the regulation. They also affirm the importance of treating every student with dignity and respect, aligning with the policy's goals.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "nondiscrimination"], "truncated": false, "error": null}
-{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87916", "analyzed_at": "2026-05-05T18:48:41.236389+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, \"I am in full support of this policy guidance,\" indicating clear support for the regulation. The phrase \"Trans rights are human rights\" further reinforces their supportive stance.", "tone": "positive", "tags": ["transgender rights", "nondiscrimination"], "truncated": false, "error": null}
-{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87917", "analyzed_at": "2026-05-05T18:48:44.321705+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 0.95, "stance_rationale": "The commenter explicitly states 'Please support this policy' and 'Please implement this policy,' indicating a clear support for the adoption of the regulation.", "tone": "positive", "tags": ["transgender rights", "student safety", "nondiscrimination"], "truncated": false, "error": null}
-{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87918", "analyzed_at": "2026-05-05T18:48:47.920316+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "An Essential Policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I fully support this policy,' indicating clear approval of the regulation. They also describe it as 'essential for the health and wellbeing of our students and of our community,' reinforcing their supportive stance.", "tone": "positive", "tags": ["student wellbeing", "community support"], "truncated": false, "error": null}
--- a/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T20-43-59+00-00.jsonl
+++ b/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T20-43-59+00-00.jsonl
--- a/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T20-44-11+00-00.jsonl
+++ b/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T20-44-11+00-00.jsonl
@@ -1,10 +0,0 @@
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87914", "analyzed_at": "2026-05-05T20:44:11.731054+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "Support the Model Policy Wholeheartedly", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, \"I support the model policy wholeheartedly,\" indicating clear support for the regulation. They also express appreciation for the policy's inclusivity and guidance, saying it is a \"first step in creating schools in Virginia that are inclusive and welcoming for transgender and non-binary students.\"", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "school policy", "transgender rights", "educational support"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87915", "analyzed_at": "2026-05-05T20:44:14.418311+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "Please support this vital policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, 'I strongly support these proposals,' indicating clear approval of the regulation. They also affirm the importance of treating every student with dignity and respect, aligning with the policy's goals.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "nondiscrimination"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87916", "analyzed_at": "2026-05-05T20:44:17.820090+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I am in full support of this policy guidance,' indicating clear support for the regulation. The phrase 'Trans rights are human rights' further reinforces their supportive stance.", "tone": "positive", "tags": ["transgender rights", "human rights"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87917", "analyzed_at": "2026-05-05T20:44:18.982080+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 0.95, "stance_rationale": "The commenter explicitly states 'Please support this policy' and 'Please implement this policy,' indicating a clear support for the adoption of the regulation.", "tone": "positive", "tags": ["transgender rights", "student safety", "nondiscrimination"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87918", "analyzed_at": "2026-05-05T20:44:22.439016+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "An Essential Policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I fully support this policy' and describes it as 'essential for the health and wellbeing of our students and of our community,' indicating clear approval of the regulation.", "tone": "positive", "tags": ["student wellbeing", "community support", "education policy"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87919", "analyzed_at": "2026-05-05T20:44:23.589115+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "Support from a School Counselor", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states support for the guidance, noting it will be 'incredibly helpful' and 'important in order to better support transgender students.' This indicates a clear approval of the proposed regulation.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student support", "mental health", "school counseling"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87920", "analyzed_at": "2026-05-05T20:44:25.159983+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "I support this policy", "stance": "support", "stance_confidence": 0.95, "stance_rationale": "The commenter explicitly states 'I support this policy' and expresses belief in the importance of a 'welcoming and nurturing environment' for transgender students, indicating clear support for the regulation.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87921", "analyzed_at": "2026-05-05T20:44:28.076212+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "It’s about time!", "stance": "support", "stance_confidence": 0.95, "stance_rationale": "The commenter expresses clear support for the regulation by stating that the guidance is \"a long time coming and is desperately needed.\" This indicates a strong desire for the regulation to be adopted to address issues faced by transgender students, like their son.", "tone": "positive", "tags": ["bullying prevention", "LGBTQ+ inclusion", "student safety"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87922", "analyzed_at": "2026-05-05T20:44:29.673172+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "A long overdue policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter expresses strong support for the policy, describing it as 'pro-equality' and 'evidence based,' and states that it would 'guarantee protections for transgender and gender variant youth.' The use of phrases like 'incredibly excited' and 'kudos to you, champions of equality!' further indicates a supportive stance.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "bullying prevention", "equality"], "truncated": false, "error": null}
-{"run_id": "e84adaf5-5250-42b9-97c1-59623bd99bc7", "forum_id": "452", "comment_id": "87923", "analyzed_at": "2026-05-05T20:44:35.056904+00:00", "model": "gpt-4o", "prompt_version": "cb41250", "input_title": "100% support", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I totally support this needed policy,' indicating clear support for the regulation. They emphasize the importance of safety, support, and equality for all kids, aligning with the goals of the proposed regulation.", "tone": "positive", "tags": ["student safety", "LGBTQ+ inclusion", "nondiscrimination"], "truncated": false, "error": null}
--- a/analysis/gpt4o/requests/5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl
+++ b/analysis/gpt4o/requests/5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl
--- a/analysis/gpt4o/runs/5b8714a7-0666-40a2-9d69-2d9ce9074406.json
+++ b/analysis/gpt4o/runs/5b8714a7-0666-40a2-9d69-2d9ce9074406.json
@@ -1,16 +0,0 @@
-{
-  "run_id": "5b8714a7-0666-40a2-9d69-2d9ce9074406",
-  "input_filename": "output\\f452.jsonl",
-  "input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
-  "prompt_hash": "cb41250",
-  "model": "gpt-4o",
-  "batch_id": "batch_69fa579c7cd081909c049715838df6c6",
-  "records_submitted": 9083,
-  "records_completed": 0,
-  "records_failed": 0,
-  "request_filename": "C:\\Users\\moses\\projects\\vath\\analysis\\gpt4o\\requests\\5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl",
-  "raw_output_filename": null,
-  "normalized_output_filename": null,
-  "created_at": "2026-05-05T20:48:28.268022+00:00",
-  "completed_at": null
-}
--- a/docs/pipeline-1.2.3.svg
+++ b/docs/pipeline-1.2.3.svg
--- a/docs/pipeline-v1.2.3.drawio
+++ b/docs/pipeline-v1.2.3.drawio
@@ -0,0 +1,99 @@
+<mxfile host="app.diagrams.net">
+  <diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv">
+    <mxGraphModel dx="2179" dy="1118" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
+      <root>
+        <mxCell id="0" />
+        <mxCell id="1" parent="0" />
+        <mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="200" y="290" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-1" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="scraper" vertex="1">
+          <mxGeometry height="60" width="120" x="40" y="170" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-46" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" target="mENAtx_syaeSO5uR6kG6-34">
+          <mxGeometry relative="1" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1">
+          <mxGeometry height="60" width="120" x="400" y="170" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=center;verticalAlign=top;rounded=0;" value="gather forum data" vertex="1">
+          <mxGeometry height="60" width="120" x="20" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;tokenize forum,&lt;/div&gt;&lt;div&gt;generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1">
+          <mxGeometry height="60" width="120" x="400" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-35">
+          <mxGeometry relative="1" as="geometry">
+            <mxPoint x="910" y="270" as="targetPoint" />
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="batch" vertex="1">
+          <mxGeometry height="60" width="120" x="720" y="170" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;--model&lt;/div&gt;&lt;div&gt;--limit&lt;/div&gt;" vertex="1">
+          <mxGeometry height="60" width="120" x="590" y="210" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1">
+          <mxGeometry height="60" width="120" x="-90" y="170" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-25" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--prompt" vertex="1">
+          <mxGeometry height="60" width="120" x="270" y="210" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;split job into batches&lt;/div&gt;&lt;div&gt;submit first batch&lt;/div&gt;&lt;div&gt;status of current batch&lt;/div&gt;&lt;div&gt;download batch artifacts&lt;/div&gt;" vertex="1">
+          <mxGeometry height="60" width="120" x="720" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
+          <mxGeometry height="70" width="50" x="210" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-30" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
+          <mxGeometry height="70" width="50" x="220" y="250" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-45" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-31" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-5">
+          <mxGeometry relative="1" as="geometry">
+            <Array as="points">
+              <mxPoint x="320" y="304" />
+              <mxPoint x="320" y="200" />
+            </Array>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;forum&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
+          <mxGeometry height="70" width="50" x="230" y="260" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19">
+          <mxGeometry relative="1" as="geometry">
+            <Array as="points">
+              <mxPoint x="640" y="284" />
+              <mxPoint x="640" y="200" />
+            </Array>
+          </mxGeometry>
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
+          <mxGeometry height="70" width="50" x="560" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="job.json" vertex="1">
+          <mxGeometry height="70" width="50" x="890" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-41" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
+          <mxGeometry height="70" width="50" x="940" y="340" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-42" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
+          <mxGeometry height="70" width="50" x="950" y="350" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;batchN-&lt;/div&gt;&lt;div&gt;output-&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
+          <mxGeometry height="70" width="50" x="960" y="360" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
+          <mxGeometry height="70" width="50" x="980" y="240" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-51" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-41">
+          <mxGeometry relative="1" as="geometry" />
+        </mxCell>
+        <mxCell id="mENAtx_syaeSO5uR6kG6-53" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-48">
+          <mxGeometry relative="1" as="geometry" />
+        </mxCell>
+      </root>
+    </mxGraphModel>
+  </diagram>
+</mxfile>
--- a/docs/tasks.org
+++ b/docs/tasks.org
@@ -104,7 +104,7 @@ Reference: ./docs/openai-batch.md. openai batch output order is not guaranteed,
 - tests: 18 passing (pytest tests/analysis_gpt4o_batch.py), 46 total across suite
 - datetime: [2026-05-05 Tue 17:00]

-* [ ] t1.2.2: Tokenizer / Batch mgmt
+* [X] t1.2.2: Tokenizer / Batch mgmt
 openai batch analysis requires coordination - more like a job queue.
 batch script should setup queue for user to setup manually; openai api will reject subsequent batches when the total daily token limit is maxed.
 ** Acceptance Criteria
@@ -117,17 +117,136 @@ batch script should setup queue for user to setup manually; openai api will reje
   - Each chunk becomes its own batch submission with its own run_id.
   - Drop --limit (or keep as hard cap override).
   - Print all run_ids
-   - Submit the first batch only
+   - Submit the first batch only (failed)
 4. Update test script to show tokenizer output

 ** notes
+- MODEL_LIMITS and _MODEL_ENCODING dicts in analysis/gpt4o/analysis_batch.py; keyed by model name, sourced from docs/openai.md. Unknown models fall back to o200k_base encoding and 900k token limit.
+- estimate_tokens(messages, model): uses tiktoken (o200k_base) when available; falls back to chars/3 + 4 overhead per message.
+- chunk_comments_by_tokens(comments, forum, model): greedy bin-pack; respects 10% headroom (_LIMIT_BUFFER=0.90). Returns list of comment lists.
+- submit sends only chunks[0] — enqueued token limit is a TOTAL across all concurrent batches; stacking would exceed quota. Remaining chunk ranges are printed as manual instructions.
+- --limit N still available as a hard cap on total comments before chunking (useful when org-tier limit is below the published model limit).
+- pip install tiktoken required for exact token counting; chars/3 fallback activates automatically if not installed.
+
+  
+*** usage
+- `pip install tiktoken`
+- submit first chunk (auto-sized to model token limit, uses most recent output file)
+  `python analysis/gpt4o/analysis_batch.py submit output/f452.jsonl --model gpt-4o-mini`
+- check status (defaults to most recent run)
+  `python analysis/gpt4o/analysis_batch.py status`
+- download + normalize when complete
+  `python analysis/gpt4o/analysis_batch.py download`
+- submit next chunk: rerun with `--limit` to cover the next N comments
+  (track which comment_ids have already been analyzed to avoid duplicates)
+
+*** validation
+#+begin_src python
+import pandas as pd
+df_input = pd.read_json('C:/Users/moses/projects/vath/analysis/gpt4o/runs/75ee9a/f452.jsonl', lines=True)
+# drop forum item
+df_input_comments = df_input[df_input["comment_id"].notna()].copy()
+df_output = pd.read_json('C:/Users/moses/projects/vath/analysis/gpt4o/runs/75ee9a/75ee9a6c-8fc2-4924-8d96-b55bb4d5e832_gpt-4o.jsonl', lines=True)
+dfm = df_output.merge(df_input_comments,on="comment_id",how="left",suffixes=("","_input"),)
+dfm.to_csv('C:/Users/moses/projects/vath/analysis/gpt4o/1.csv')
+#+end_src
+order columns:
+forum_id_input,comment_id,title,text,date,author,stance,stance_confidence,stance_rationale,tone,tags,error,truncated,analyzed_at,prompt_version,model
+
+** evidence
+- commit:
+- tests: 23 passing (pytest tests/analysis_gpt4o_batch.py), 51 total across suite
+- datetime: [2026-05-06 Wed 08:55]
+
+* [ ] t1.2.3: batch job refactor
+This task encompasses intent and fixes for 1.2.1 and 1.2.2.
+batch processing should be a resumable job queue, not a one-shot script. the user should not need to remember offsets, completed chunks, failed batches, or which comments remain.
+** Acceptance Criteria
+1. create tokenizer to prepare the batch job
+   - input: prompt.txt, forum.jsonl
+   - output: report.json with each model's batch structure, cost, and time (considering TPD, i.e. tokens-per-day, constraints)
+     - analysis_batch should be able to take this report to run the job. good place to copy the raw scraper jsonl
+     #+begin_src python
+       {'prompt': 'prompt1.txt',
+        'input_file': 'f451.jsonl',
+        'input_tokens': 123456789,
+        'gpt-4o': {'jobs':71,'cost_$':4,'est_queue_days':3},  # divide tokens by model TPD to get est_queue_days
+        'gpt-4o-mini': {'jobs':71,'cost_$':4,'est_queue_days':3}}  # divide tokens by model TPD to get est_queue_days
+     #+end_src
+2. the batch script (analysis_batch.py) should contain commands to create, check, run (submit), and complete (download) jobs.
+   - inputs: report.json, --model, optional --job N, read api key from .env
+   - outputs:
+     - status.json: job structure, status, metadata; updated when jobs are finished. includes all report.json info
+     - for each job: jobN-input.jsonl (what is sent to openai); jobN-output-raw.jsonl, jobN-output.jsonl, and jobN-errors.jsonl (when downloaded)       
+     - jobN-output.jsonl contains:
+       - one analysis record per comment
+       - `run_id`, `forum_id`, `comment_id`, `analyzed_at`, `model`, `prompt_version`
+       - `stance` toward proposed reg/guidance: support|oppose|neutral|unclear
+       - `stance_confidence`: 0-1
+       - short rationale, if provided by model
+       - generic sentiment `tone` (separate from stance): positive|negative|neutral|mixed|unclear
+       - `tags` for later grouping, may be empty
+   - commands: `create`, `submit`, `status`, `download`
+     - `create` run directory, copy input/prompt/report, generate status.json, job request files
+     - `submit` if eligible, submit next or specified job; does not blindly stack jobs, warns if prev jobs in progress, print next action
+     - `status` check status of one or all submitted jobs, update status.json
+     - `download` raw output (jobN-output-raw.jsonl) and error files for completed jobs, then normalize the raw output into jobN-output.jsonl; automatically runs `status`.
+3. tests without live api calls
+   - partial completed run
+   - failed batch records
+   - out-of-order output
+   - duplicate custom_id
+   - missing output file
+   - resume from status.json
+   - remaining-comment detection
+
+* === Backlog ===
+* [ ] X: analysis validation view
+create a lightweight validation script that joins raw comments to normalized analysis output and writes a human-reviewable csv.
+
+** acceptance criteria
+1. input raw scrape jsonl and all *-output.jsonl files in a dir
+2. join by comment_id, not dataframe index
+3. output csv columns in review order:
+   - forum_id, comment_id, title, text, date, author
+   - stance, stance_confidence, stance_rationale, tone, tags
+   - error, truncated, analyzed_at, prompt_version, model
+4. print validation counts
+   - raw comments
+   - analyzed records
+   - joined records
+   - missing comment text
+   - duplicate comment_ids
+   - error records
+   - stance counts
+   - tone counts
+5. tests cover join behavior and missing/duplicate ids

 ** evidence
 - commit:
 - tests:
- datetime:   
+- csv:
+- datetime:       
+* [ ] X: text encoding cleanup
+fix mojibake in scraped text before analysis/reporting, especially curly quotes showing as â€™.

-   
+** acceptance criteria
+1. identify whether mojibake exists in raw scrape, analysis output, or csv export only
+2. add repair step at the earliest correct layer
+3. preserve original raw scrape if repair changes source text
+4. add test cases for common bad sequences:
+   - â€™
+   - â€œ
+   - â€
+   - â€“
+   - â€”
+5. document whether repaired text is used for model input
+
+** evidence
+- commit:
+- tests:
+- before/after sample:
+- datetime:
 * [ ] X: complete proposal information
 Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted. 
 ** acceptance criteria
--- a/tests/analysis_gpt4o_batch.py
+++ b/tests/analysis_gpt4o_batch.py
@@ -250,3 +250,62 @@ def test_manifest_save_load_roundtrip(tmp_path, monkeypatch):
    bt.save_manifest(m)
    loaded = bt.load_manifest(RUN_ID)
    assert loaded == m
+
+
+# ---------------------------------------------------------------------------
+# estimate_tokens
+
+def test_estimate_tokens_returns_positive_int():
+    messages = [{"role": "system", "content": "hello"}, {"role": "user", "content": "world"}]
+    result = bt.estimate_tokens(messages, "gpt-4o-mini")
+    assert isinstance(result, int)
+    assert result > 0
+
+
+def test_estimate_tokens_longer_content_is_larger():
+    short_msg = [{"role": "user", "content": "hi"}]
+    long_msg  = [{"role": "user", "content": "hi " * 500}]
+    assert bt.estimate_tokens(long_msg, "gpt-4o-mini") > bt.estimate_tokens(short_msg, "gpt-4o-mini")
+
+
+def test_estimate_tokens_fallback_without_tiktoken(monkeypatch):
+    import sys as _sys
+    monkeypatch.setitem(_sys.modules, "tiktoken", None)
+    messages = [{"role": "user", "content": "x" * 300}]
+    result = bt.estimate_tokens(messages, "gpt-4o")
+    assert result == 4 + 300 // 3
+
+
+# ---------------------------------------------------------------------------
+# chunk_comments_by_tokens
+
+def test_chunk_single_chunk_for_small_input(monkeypatch):
+    monkeypatch.setattr(bt, "MODEL_LIMITS", {"gpt-4o-mini": 10_000_000})
+    comments = [COMMENT_ITEM, {**COMMENT_ITEM, "comment_id": "99999"}]
+    chunks = bt.chunk_comments_by_tokens(comments, FORUM_ITEM, "gpt-4o-mini")
+    assert len(chunks) == 1
+    assert len(chunks[0]) == 2
+
+
+def test_chunk_splits_when_over_limit(monkeypatch):
+    monkeypatch.setattr(bt, "MODEL_LIMITS", {"gpt-4o-mini": 1})
+    comments = [
+        COMMENT_ITEM,
+        {**COMMENT_ITEM, "comment_id": "99999"},
+        {**COMMENT_ITEM, "comment_id": "88888"},
+    ]
+    chunks = bt.chunk_comments_by_tokens(comments, FORUM_ITEM, "gpt-4o-mini")
+    assert len(chunks) == len(comments)
+
+
+def test_chunk_preserves_all_comments(monkeypatch):
+    monkeypatch.setattr(bt, "MODEL_LIMITS", {"gpt-4o-mini": 200})
+    comments = [{**COMMENT_ITEM, "comment_id": str(i)} for i in range(10)]
+    chunks = bt.chunk_comments_by_tokens(comments, FORUM_ITEM, "gpt-4o-mini")
+    flat = [c for chunk in chunks for c in chunk]
+    assert len(flat) == 10
+
+
+def test_model_limits_has_required_models():
+    for model in ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini"):
+        assert model in bt.MODEL_LIMITS, f"{model} missing from MODEL_LIMITS"
Author	SHA1	Message	Date
eulaly	946aeac7c8	cleanup	2026-05-06 13:30:53 -04:00
eulaly	e1ad4432a7	refactor/batch-openai prep	2026-05-06 13:29:59 -04:00