completed openai batch work

This commit is contained in:
2026-05-07 07:24:11 -04:00
parent 64a7a18721
commit f5d679808e
29 changed files with 36711 additions and 83 deletions

View File

@@ -43,3 +43,4 @@ Description and PM notes
- project dir: `%userprofile%\projects\vath\`
- python venv: `%userprofile%\projects\vath\venv\scripts\activate`
- pytest (inside venv): `python -m pytest tests/`
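- create tests without the `test_` prefix, e.g. `tests/tokenizer.py`, not `tests/test_tokenizer.py` (pytest's default discovery only collects `test_*.py` / `*_test.py`, so this relies on `python_files` being configured or on passing the test files explicitly)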

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,23 @@
You are an expert policy analyst classifying public comments submitted to the Virginia Town Hall
regulatory comment system. You will be given the text of a proposed regulation and a single
public comment. Return ONLY a JSON object — no other text.
Definitions:
- stance: the commenter's position on whether the regulation should be adopted.
"support" = wants it approved (as-is or with changes);
"oppose" = wants it rejected or substantially weakened;
"neutral" = takes no position, asks a question, or provides factual input only;
"unknown" = too vague, off-topic, or uninterpretable to classify.
- tone: the emotional register of the writing, independent of stance.
"positive" = affirming, hopeful, appreciative;
"negative" = angry, fearful, alarmed, or contemptuous;
"neutral" = matter-of-fact, procedural, or informational;
"mixed" = contains both positive and negative emotional content;
"unclear" = tone cannot be determined (e.g., a one-word comment).
- stance_confidence: float 0.0-1.0, your confidence in the stance label.
- stance_rationale: 1-3 sentences explaining the key evidence; quote specific phrases where possible.
- tags: up to 5 short topic labels relevant to the comment's specific concerns (e.g.
"parental rights", "student safety", "privacy", "religious freedom", "LGBTQ+ inclusion",
"bullying prevention", "school sports", "bathroom access"). Empty array if none apply.
Return exactly these keys: stance, stance_confidence, stance_rationale, tone, tags.

View File

@@ -0,0 +1,43 @@
{
"prompt": "analysis\\prompt-1.txt",
"prompt_hash": "cb41250",
"input_file": "output\\f452.jsonl",
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
"total_comments": 9083,
"input_tokens": 6397254,
"gpt-5.5": {
"jobs": 9,
"cost_$": 15.9931,
"est_queue_days": 7.11
},
"gpt-5.4": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-5.4-mini": {
"jobs": 4,
"cost_$": 2.399,
"est_queue_days": 3.2
},
"gpt-5.4-nano": {
"jobs": 40,
"cost_$": 0.6397,
"est_queue_days": 31.99
},
"gpt-4o": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-4o-mini": {
"jobs": 4,
"cost_$": 0.4798,
"est_queue_days": 3.2
},
"gpt-o4-mini": {
"jobs": 4,
"cost_$": 3.5185,
"est_queue_days": 3.2
}
}

View File

@@ -0,0 +1,57 @@
{
"model": "gpt-5.4-mini",
"prompt_hash": "cb41250",
"input_file": "output\\f452.jsonl",
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
"total_comments": 9083,
"input_tokens": 6397254,
"est_queue_days": 3.2,
"cost_$": 2.399,
"total_jobs": 4,
"jobs": [
{
"job_num": 1,
"run_id": "76c97113-63aa-43db-8f84-9c60ebcbb105",
"status": "completed",
"batch_id": "batch_69fb9081639881909be0c40d86edd747",
"records_submitted": 2270,
"records_completed": 2270,
"records_failed": 0,
"submitted_at": "2026-05-06T19:03:28.949240+00:00",
"completed_at": "2026-05-06T20:09:14+00:00"
},
{
"job_num": 2,
"run_id": "b8f3b0bb-f155-4a5c-acce-f3504c0e09aa",
"status": "completed",
"batch_id": "batch_69fba02df7b481909e96afa1ee8879f5",
"records_submitted": 2274,
"records_completed": 2274,
"records_failed": 0,
"submitted_at": "2026-05-06T20:10:21.424330+00:00",
"completed_at": "2026-05-06T20:37:11+00:00"
},
{
"job_num": 3,
"run_id": "8d769f37-6beb-4a1b-87ee-3f66cdc6adc8",
"status": "completed",
"batch_id": "batch_69fba69a85488190977792b6f95b614b",
"records_submitted": 2282,
"records_completed": 2282,
"records_failed": 0,
"submitted_at": "2026-05-06T20:37:45.586815+00:00",
"completed_at": "2026-05-06T21:09:24+00:00"
},
{
"job_num": 4,
"run_id": "e6affbc2-ddc9-43a6-b8e9-d1f47e736283",
"status": "completed",
"batch_id": "batch_69fbe44565748190ad19f17ee3143f8d",
"records_submitted": 2257,
"records_completed": 2257,
"records_failed": 0,
"submitted_at": "2026-05-07T01:00:52.886953+00:00",
"completed_at": "2026-05-07T09:20:01+00:00"
}
]
}

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3
"""
analysis_batch.py OpenAI Batch API job runner
openai_batch.py OpenAI Batch API job runner
Run tokenizer.py first to generate report.json, then:
create <report.json> --model <model> build job directory
@@ -8,7 +8,7 @@ Run tokenizer.py first to generate report.json, then:
status [--job N] [--dir DIR] check job status
download [--job N] [--dir DIR] download + normalize completed jobs
DIR is a name under analysis/gpt4o/jobs/ (default: most recently created).
DIR is a name under analysis/jobs/ (default: most recently created).
"""
import argparse
@@ -52,17 +52,24 @@ _MODEL_ENCODING: dict[str, str] = {
"gpt-4o-mini": "o200k_base",
"gpt-o4-mini": "o200k_base",
}
_LIMIT_BUFFER = 0.90
_LIMIT_BUFFER = 0.80
def estimate_tokens(messages: list[dict], model: str) -> int:
"""Exact token count via tiktoken; falls back to chars/3 + 4 overhead per message."""
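"""Token count per OpenAI cookbook chat formula; without tiktoken, falls back to chars/3 per message plus 3 tokens per message and a 3-token reply primer (role tokens are not counted)."""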
try:
import tiktoken
enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base"))
return sum(4 + len(enc.encode(m["content"])) for m in messages)
# Per OpenAI cookbook for gpt-4o: 3 overhead per message + role + content;
# plus 3 tokens for the reply primer (<|start|>assistant<|message|>).
total = 3 # reply primer
for m in messages:
total += 3
total += len(enc.encode(m.get("role", "")))
total += len(enc.encode(m["content"]))
return total
except ImportError:
return sum(4 + len(m["content"]) // 3 for m in messages)
return 3 + sum(3 + len(m["content"]) // 3 for m in messages)
def chunk_comments_by_tokens(
@@ -91,7 +98,7 @@ def chunk_comments_by_tokens(
# ---------------------------------------------------------------------------
# Prompt
_DEFAULT_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt"
_DEFAULT_PROMPT_FILE = Path(__file__).parent / "prompt-1.txt"
SYSTEM_PROMPT = _DEFAULT_PROMPT_FILE.read_text(encoding="utf-8").strip()
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
@@ -375,7 +382,7 @@ def cmd_create(args) -> None:
print(f"Created: {job_dir.name}")
print(f" {len(chunks)} job(s) | {len(comments)} comments | model: {args.model}")
print(f"\nNext: python analysis/gpt4o/analysis_batch.py submit")
print(f"\nNext: python analysis/openai_batch.py submit")
# ---------------------------------------------------------------------------
@@ -431,7 +438,7 @@ def cmd_submit(args, client) -> None:
save_status(status, job_dir)
print(f"Job {n} submitted: {batch.id} ({batch.status})")
print(f" python analysis/gpt4o/analysis_batch.py status")
print(f" python analysis/openai_batch.py status")
# ---------------------------------------------------------------------------

View File

@@ -1,12 +1,12 @@
#!/usr/bin/env python3
"""
analysis/gpt4o/analysis-realtime.py Synchronous GPT-4o pipeline for VA Townhall comments.
analysis/openai_realtime.py Synchronous GPT-4o pipeline for VA Townhall comments.
Usage:
python analysis/gpt4o/analysis-realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
python analysis/openai_realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
Output:
analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
analysis/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
"""
import argparse
@@ -30,7 +30,7 @@ except ImportError:
# ---------------------------------------------------------------------------
# Prompt — loaded from analysis/prompt-1.txt at import time
_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt"
_PROMPT_FILE = Path(__file__).parent / "prompt-1.txt"
SYSTEM_PROMPT = _PROMPT_FILE.read_text(encoding="utf-8").strip()
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]

View File

@@ -3,10 +3,11 @@
tokenizer.py estimate token usage and cost for a batch analysis run.
Usage:
python analysis/gpt4o/tokenizer.py output/f452.jsonl [--prompt analysis/prompt-1.txt]
python analysis/tokenizer.py output/f452.jsonl [--prompt analysis/prompt-1.txt]
python analysis/tokenizer.py analysis/jobs/f452-1/job1-input.jsonl # count actual tokens in a job
Prints a per-model comparison table and writes report.json next to the input file.
Run this before analysis_batch.py create.
Prints a per-model comparison table and writes reports/<stem>-report.json.
Run this before openai_batch.py create.
"""
import argparse
@@ -17,7 +18,7 @@ import sys
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent))
import analysis_batch as _ab
import openai_batch as _ab
# Input pricing ($/1M tokens, batch API) — from docs/openai.md, updated 2026-05-05.
# Add Anthropic/other models here when needed; only models with a LIMITS entry are reported.
@@ -66,6 +67,32 @@ def compute_report(
return report
def count_input_tokens(path: Path, model: str = "gpt-4o") -> dict:
    """Tally estimated tokens for every request in a batch job input JSONL.

    Blank lines are skipped. Every other line is parsed as JSON and must carry
    ``body.messages`` (as written by build_batch_request_line); that message
    list is handed to ``_ab.estimate_tokens`` together with *model*.

    Returns {"total_tokens": int, "total_requests": int, "min": int,
    "max": int, "mean": float}, with the mean rounded to one decimal place.
    A file containing no requests yields zero for every field.
    """
    with open(path, encoding="utf-8") as handle:
        stripped_lines = (raw.strip() for raw in handle)
        per_request = [
            _ab.estimate_tokens(json.loads(text)["body"]["messages"], model)
            for text in stripped_lines
            if text
        ]

    n_requests = len(per_request)
    if n_requests == 0:
        return {"total_tokens": 0, "total_requests": 0, "min": 0, "max": 0, "mean": 0.0}

    grand_total = sum(per_request)
    return {
        "total_tokens": grand_total,
        "total_requests": n_requests,
        "min": min(per_request),
        "max": max(per_request),
        "mean": round(grand_total / n_requests, 1),
    }
def print_table(report: dict) -> None:
"""Print a human-readable model comparison table to stdout."""
print(f"\nInput: {report['input_file']}")
@@ -90,11 +117,21 @@ def print_table(report: dict) -> None:
print()
def _is_job_input(path: Path) -> bool:
    """Return True if this JSONL looks like a batch request file.

    Only the first non-blank line is inspected. The file counts as a job input
    when that line decodes to a JSON object with a top-level ``custom_id`` key.
    An empty or whitespace-only file returns False. Invalid JSON on the
    inspected line propagates ``json.JSONDecodeError``.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            record = json.loads(line)
            # Require a JSON object: applying `in` to a JSON string would be a
            # substring match, to an array an element match, and to a number
            # a TypeError -- none of which means "has a custom_id field".
            return isinstance(record, dict) and "custom_id" in record
    return False
def main() -> None:
_default_prompt = Path(__file__).parent.parent / "prompt-1.txt"
_default_prompt = Path(__file__).parent / "prompt-1.txt"
parser = argparse.ArgumentParser(description="Estimate batch token usage and cost.")
parser.add_argument("input", help="Scraped JSONL file")
parser.add_argument("input", help="Scraped JSONL or job input JSONL (jobN-input.jsonl)")
parser.add_argument(
"--prompt",
default=str(_default_prompt),
@@ -106,6 +143,16 @@ def main() -> None:
if not input_path.exists():
sys.exit(f"File not found: {input_path}")
# --- Mode: count tokens in an existing job input file ---
if _is_job_input(input_path):
result = count_input_tokens(input_path)
print(f"\nJob input: {input_path.name}")
print(f" Requests : {result['total_requests']:,}")
print(f" Tokens : {result['total_tokens']:,}")
print(f" Per-req : min={result['min']} max={result['max']} mean={result['mean']}")
return
# --- Mode: estimate from raw scrape file and write report.json ---
prompt_path = Path(args.prompt)
if not prompt_path.exists():
sys.exit(f"Prompt file not found: {prompt_path}")
@@ -131,10 +178,12 @@ def main() -> None:
print_table(report)
out_path = input_path.parent / "report.json"
reports_dir = Path(__file__).parent.parent / "reports"
reports_dir.mkdir(exist_ok=True)
out_path = reports_dir / f"{input_path.stem}-report.json"
out_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"Report written to: {out_path}")
print(f"\nNext: python analysis/gpt4o/analysis_batch.py create {out_path} --model <model>")
print(f"\nNext: python analysis/openai_batch.py create {out_path} --model <model>")
if __name__ == "__main__":

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 130 KiB

View File

@@ -1,9 +1,18 @@
<mxfile host="app.diagrams.net">
<diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv">
<mxGraphModel dx="2179" dy="1118" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<mxGraphModel dx="1315" dy="798" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="mENAtx_syaeSO5uR6kG6-61" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1000" y="330" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-60" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1010" y="340" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-59" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1020" y="350" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29">
<mxGeometry relative="1" as="geometry">
<mxPoint x="200" y="290" as="targetPoint" />
@@ -18,18 +27,18 @@
<mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1">
<mxGeometry height="60" width="120" x="400" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=center;verticalAlign=top;rounded=0;" value="gather forum data" vertex="1">
<mxGeometry height="60" width="120" x="20" y="240" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div align=&quot;left&quot;&gt;- collect forum data&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="40" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;tokenize forum,&lt;/div&gt;&lt;div&gt;generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;- tokenize forum&lt;/div&gt;&lt;div&gt;- generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="400" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-35">
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-73">
<mxGeometry relative="1" as="geometry">
<mxPoint x="910" y="270" as="targetPoint" />
<mxPoint x="953" y="240" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="batch" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="openai_batch" vertex="1">
<mxGeometry height="60" width="120" x="720" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;--model&lt;/div&gt;&lt;div&gt;--limit&lt;/div&gt;" vertex="1">
@@ -38,11 +47,8 @@
<mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1">
<mxGeometry height="60" width="120" x="-90" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-25" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--prompt" vertex="1">
<mxGeometry height="60" width="120" x="270" y="210" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;split job into batches&lt;/div&gt;&lt;div&gt;submit first batch&lt;/div&gt;&lt;div&gt;status of current batch&lt;/div&gt;&lt;div&gt;download batch artifacts&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="720" y="240" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;- split job into batches&lt;/div&gt;&lt;div&gt;- submit first batch&lt;/div&gt;&lt;div&gt;- status of current batch&lt;/div&gt;&lt;div&gt;- download batch artifacts&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="140" x="720" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="210" y="240" as="geometry" />
@@ -58,7 +64,7 @@
</Array>
</mxGeometry>
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;forum&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;&amp;lt;forumid&amp;gt;&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="230" y="260" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19">
@@ -69,30 +75,42 @@
</Array>
</mxGeometry>
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;&lt;br&gt;&lt;/div&gt;&lt;div&gt;&amp;lt;forumid&amp;gt;&lt;br&gt;-report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="560" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="job.json" vertex="1">
<mxGeometry height="70" width="50" x="890" y="240" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;status&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="913.25" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-41" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="940" y="340" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-&lt;/div&gt;&lt;div&gt;output&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="1090" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-42" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="950" y="350" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="1150" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;batchN-&lt;/div&gt;&lt;div&gt;output-&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="960" y="360" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-54" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-&lt;/div&gt;&lt;div&gt;input&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="1030" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="980" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-51" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-41">
<mxCell id="mENAtx_syaeSO5uR6kG6-64" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-63" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-5">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-53" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-48">
<mxCell id="mENAtx_syaeSO5uR6kG6-63" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;prompt&lt;/div&gt;&lt;div&gt;.txt&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="270" y="90" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-67" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="create" vertex="1">
<mxGeometry height="20" width="120" x="850" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-71" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;submit&lt;/div&gt;&lt;div&gt;&lt;br&gt;&lt;/div&gt;&lt;div&gt;status&lt;/div&gt;&lt;div&gt;download&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="1020" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-75" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" target="mENAtx_syaeSO5uR6kG6-35">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-76" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-61">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-73" parent="1" style="image;aspect=fixed;perimeter=ellipsePerimeter;html=1;align=center;shadow=0;dashed=0;spacingTop=3;image=img/lib/active_directory/folder.svg;" value="&amp;lt;forumid&amp;gt;-N" vertex="1">
<mxGeometry height="50" width="36.5" x="920" y="240" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>

4
docs/pipeline-v1.2.3.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 170 KiB

View File

@@ -201,44 +201,48 @@ batch processing should be a resumable job queue, not a one-shot script. the us
- remaining-comment detection
** notes
- analysis/gpt4o/tokenizer.py: new standalone script; imports analysis_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes report.json to input file's directory. MODEL_PRICING dict lives here (not in analysis_batch).
- analysis/gpt4o/analysis_batch.py: fully rewritten with four subcommands: create, submit, status, download. No longer uses REQUESTS_DIR / RAW_DIR / RUNS_DIR.
- Job directories: analysis/gpt4o/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
- analysis/tokenizer.py: new standalone script; imports openai_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes reports/<stem>-report.json. MODEL_PRICING dict lives here (not in openai_batch). Pass a jobN-input.jsonl to count actual tokens instead.
- analysis/openai_batch.py: fully rewritten with four subcommands: create, submit, status, download. Job dirs at analysis/jobs/<stem[:8]>-N/.
- Job directories: analysis/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
- status.json: tracks all jobs with pending/submitted/in_progress/completed/failed states. Updated by submit, status, download.
- _find_next_eligible_job: pure function for testability. Returns (next_pending_job, None) or (None, warning). Blocks submission if previous job is in_progress/submitted.
- create: no API key required. Reads report.json, re-chunks comments, writes all jobN-input.jsonl files, writes status.json.
- submit: uploads jobN-input.jsonl to Files API, creates batch, updates status.json to 'submitted'. Will not stack batches.
- status: retrieves batch from OpenAI, updates status.json counts and status.
- download: auto-runs status first, downloads output_file_id → jobN-output-raw.jsonl, error_file_id → jobN-errors.jsonl, normalizes → jobN-output.jsonl. Updates status.json.
- tests/test_tokenizer.py: 15 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, report.json round-trip.
- tests/tokenizer.py: 19 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, count_input_tokens, report.json round-trip.
- Token limit buffer: _LIMIT_BUFFER=0.80 (20% headroom). Estimate uses OpenAI cookbook chat formula (role tokens + 3-token reply primer). Verify a job file with: python analysis/tokenizer.py analysis/jobs/<dir>/jobN-input.jsonl
*** usage
#+begin_src sh
#+begin_src powershell
# 1. estimate tokens and cost
python analysis/gpt4o/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
# writes output/report.json
python analysis/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
# writes reports/f452-report.json
# 2. create job directory (no api key needed)
python analysis/gpt4o/analysis_batch.py create output/report.json --model gpt-4o-mini
# creates analysis/gpt4o/jobs/f452-1/
# 2. verify actual tokens in a job file (optional sanity check; the job file only exists once step 3 has created the job directory)
python analysis/tokenizer.py analysis/jobs/f452-1/job1-input.jsonl
# 3. submit first job
python analysis/gpt4o/analysis_batch.py submit
# 3. create job directory (no api key needed)
python analysis/openai_batch.py create reports/f452-report.json --model gpt-5.4-mini
# creates analysis/jobs/f452-1/
# 4. check status (repeat until completed)
python analysis/gpt4o/analysis_batch.py status
# 4. submit first job
python analysis/openai_batch.py submit
# 5. download and normalize
python analysis/gpt4o/analysis_batch.py download
# 5. check status (repeat until completed)
python analysis/openai_batch.py status
# 6. submit next job (if multi-job run), then repeat 4-5
python analysis/gpt4o/analysis_batch.py submit
# 6. download and normalize
python analysis/openai_batch.py download
# 7. submit next job (if multi-job run), then repeat 5-6
python analysis/openai_batch.py submit
#+end_src
** evidence
- commit:
- tests: passing (pytest tests/analysis_gpt4o_batch.py tests/test_tokenizer.py)
- datetime: [2026-05-05 Tue]
- tests: passing (pytest tests/openai_batch.py tests/openai_realtime.py tests/tokenizer.py)
- datetime: [2026-05-06 Wed]
* === Backlog ===
* [ ] X: analysis validation view

43
reports/f452-1.json Normal file
View File

@@ -0,0 +1,43 @@
{
"prompt": "analysis\\prompt-1.txt",
"prompt_hash": "cb41250",
"input_file": "output\\f452.jsonl",
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
"total_comments": 9083,
"input_tokens": 6397254,
"gpt-5.5": {
"jobs": 9,
"cost_$": 15.9931,
"est_queue_days": 7.11
},
"gpt-5.4": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-5.4-mini": {
"jobs": 4,
"cost_$": 2.399,
"est_queue_days": 3.2
},
"gpt-5.4-nano": {
"jobs": 40,
"cost_$": 0.6397,
"est_queue_days": 31.99
},
"gpt-4o": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-4o-mini": {
"jobs": 4,
"cost_$": 0.4798,
"est_queue_days": 3.2
},
"gpt-o4-mini": {
"jobs": 4,
"cost_$": 3.5185,
"est_queue_days": 3.2
}
}

View File

@@ -1,4 +1,4 @@
"""Unit tests for analysis/gpt4o/analysis_batch.py — no real API calls."""
"""Unit tests for analysis/openai_batch.py — no real API calls."""
import json
import sys
@@ -7,8 +7,8 @@ from unittest.mock import MagicMock
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
import analysis_batch as bt
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
import openai_batch as bt
# ---------------------------------------------------------------------------
@@ -101,7 +101,7 @@ def test_prompt_version_is_7_hex_chars():
def test_prompt_version_matches_realtime():
"""Both scripts must derive the same PROMPT_VERSION from the same file."""
import analysis_realtime as rt
import openai_realtime as rt
assert bt.PROMPT_VERSION == rt.PROMPT_VERSION
@@ -242,7 +242,8 @@ def test_estimate_tokens_fallback_without_tiktoken(monkeypatch):
monkeypatch.setitem(_sys.modules, "tiktoken", None)
messages = [{"role": "user", "content": "x" * 300}]
result = bt.estimate_tokens(messages, "gpt-4o")
assert result == 4 + 300 // 3
# fallback: 3 primer + (3 + 300//3) per message
assert result == 3 + (3 + 300 // 3)
# ---------------------------------------------------------------------------

View File

@@ -1,4 +1,4 @@
"""Unit tests for analysis/gpt4o/analysis_realtime.py — no real API calls."""
"""Unit tests for analysis/openai_realtime.py — no real API calls."""
import json
import sys
@@ -7,8 +7,8 @@ from unittest.mock import MagicMock
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
import analysis_realtime as rt
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
import openai_realtime as rt
# ---------------------------------------------------------------------------

View File

@@ -1,4 +1,4 @@
"""Unit tests for analysis/gpt4o/tokenizer.py — no real API calls."""
"""Unit tests for analysis/tokenizer.py — no real API calls."""
import io
import json
@@ -9,9 +9,9 @@ from unittest.mock import patch
import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
import tokenizer as tk
import analysis_batch as ab
import openai_batch as ab
# ---------------------------------------------------------------------------
@@ -199,3 +199,52 @@ def test_report_json_roundtrip(tmp_path):
assert loaded["total_comments"] == report["total_comments"]
assert loaded["input_tokens"] == report["input_tokens"]
assert loaded["gpt-4o-mini"]["jobs"] == report["gpt-4o-mini"]["jobs"]
# ---------------------------------------------------------------------------
# count_input_tokens
def _make_job_input(tmp_path, comments, forum=None) -> Path:
    """Create ``job1-input.jsonl`` under *tmp_path* in batch request format.

    One line is written per comment: the JSON dump of
    ``ab.build_batch_request_line(comment, forum, "gpt-4o-mini")``.
    Returns the path of the written file.
    """
    target = tmp_path / "job1-input.jsonl"
    with open(target, "w", encoding="utf-8") as out:
        out.writelines(
            json.dumps(ab.build_batch_request_line(comment, forum, "gpt-4o-mini")) + "\n"
            for comment in comments
        )
    return target
def test_count_input_tokens_matches_estimate(tmp_path):
    """Totals from count_input_tokens on a fresh job file equal the summed estimate_tokens values."""
    job_file = _make_job_input(tmp_path, COMMENTS, FORUM_ITEM)
    stats = tk.count_input_tokens(job_file, "gpt-4o-mini")

    # Rebuild each request's messages and estimate them one by one.
    expected_total = 0
    for comment in COMMENTS:
        messages = ab.build_messages(comment, FORUM_ITEM)[0]
        expected_total += ab.estimate_tokens(messages, "gpt-4o-mini")

    assert stats["total_tokens"] == expected_total
    assert stats["total_requests"] == len(COMMENTS)
def test_count_input_tokens_fields(tmp_path):
    """The result exposes exactly the five summary keys, with min <= mean <= max and min > 0."""
    stats = tk.count_input_tokens(_make_job_input(tmp_path, COMMENTS, FORUM_ITEM))

    expected_keys = {"total_tokens", "total_requests", "min", "max", "mean"}
    assert set(stats) == expected_keys

    lowest, average, highest = stats["min"], stats["mean"], stats["max"]
    assert lowest <= average <= highest
    assert lowest > 0
def test_count_input_tokens_empty_file(tmp_path):
    """An empty JSONL file reports zero tokens and zero requests."""
    empty_file = tmp_path / "empty.jsonl"
    empty_file.write_text("", encoding="utf-8")

    stats = tk.count_input_tokens(empty_file)

    assert stats["total_tokens"] == 0
    assert stats["total_requests"] == 0
def test_count_input_tokens_includes_system_prompt(tmp_path):
    """The total for one request must exceed a chars // 3 estimate of the comment text alone."""
    job_file = _make_job_input(tmp_path, [COMMENT_A], FORUM_ITEM)
    stats = tk.count_input_tokens(job_file)

    # Intent: the request carries more than the user's text (the prompt adds
    # tokens), so the total must sit above the naive user-text-only estimate.
    naive_user_estimate = len(COMMENT_A.get("text", "")) // 3
    assert stats["total_tokens"] > naive_user_estimate