completed openai batch work

This commit is contained in:
2026-05-07 07:24:11 -04:00
parent 64a7a18721
commit f5d679808e
29 changed files with 36711 additions and 83 deletions

View File

@@ -43,3 +43,4 @@ Description and PM notes
- project dir: `%userprofile%\projects\vath\` - project dir: `%userprofile%\projects\vath\`
- python venv: `%userprofile%\projects\vath\venv\scripts\activate` - python venv: `%userprofile%\projects\vath\venv\scripts\activate`
- pytest (inside venv): `python -m pytest tests/` - pytest (inside venv): `python -m pytest tests/`
- create tests without the `test_` prefix, e.g. `tests/tokenizer.py`, not `tests/test_tokenizer.py`

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,23 @@
You are an expert policy analyst classifying public comments submitted to the Virginia Town Hall
regulatory comment system. You will be given the text of a proposed regulation and a single
public comment. Return ONLY a JSON object — no other text.
Definitions:
- stance: the commenter's position on whether the regulation should be adopted.
"support" = wants it approved (as-is or with changes);
"oppose" = wants it rejected or substantially weakened;
"neutral" = takes no position, asks a question, or provides factual input only;
"unknown" = too vague, off-topic, or uninterpretable to classify.
- tone: the emotional register of the writing, independent of stance.
"positive" = affirming, hopeful, appreciative;
"negative" = angry, fearful, alarmed, or contemptuous;
"neutral" = matter-of-fact, procedural, or informational;
"mixed" = contains both positive and negative emotional content;
"unclear" = tone cannot be determined (e.g., a one-word comment).
- stance_confidence: float 0.0-1.0, your confidence in the stance label.
- stance_rationale: 1-3 sentences explaining the key evidence; quote specific phrases where possible.
- tags: up to 5 short topic labels relevant to the comment's specific concerns (e.g.
"parental rights", "student safety", "privacy", "religious freedom", "LGBTQ+ inclusion",
"bullying prevention", "school sports", "bathroom access"). Empty array if none apply.
Return exactly these keys: stance, stance_confidence, stance_rationale, tone, tags.

View File

@@ -0,0 +1,43 @@
{
"prompt": "analysis\\prompt-1.txt",
"prompt_hash": "cb41250",
"input_file": "output\\f452.jsonl",
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
"total_comments": 9083,
"input_tokens": 6397254,
"gpt-5.5": {
"jobs": 9,
"cost_$": 15.9931,
"est_queue_days": 7.11
},
"gpt-5.4": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-5.4-mini": {
"jobs": 4,
"cost_$": 2.399,
"est_queue_days": 3.2
},
"gpt-5.4-nano": {
"jobs": 40,
"cost_$": 0.6397,
"est_queue_days": 31.99
},
"gpt-4o": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-4o-mini": {
"jobs": 4,
"cost_$": 0.4798,
"est_queue_days": 3.2
},
"gpt-o4-mini": {
"jobs": 4,
"cost_$": 3.5185,
"est_queue_days": 3.2
}
}

View File

@@ -0,0 +1,57 @@
{
"model": "gpt-5.4-mini",
"prompt_hash": "cb41250",
"input_file": "output\\f452.jsonl",
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
"total_comments": 9083,
"input_tokens": 6397254,
"est_queue_days": 3.2,
"cost_$": 2.399,
"total_jobs": 4,
"jobs": [
{
"job_num": 1,
"run_id": "76c97113-63aa-43db-8f84-9c60ebcbb105",
"status": "completed",
"batch_id": "batch_69fb9081639881909be0c40d86edd747",
"records_submitted": 2270,
"records_completed": 2270,
"records_failed": 0,
"submitted_at": "2026-05-06T19:03:28.949240+00:00",
"completed_at": "2026-05-06T20:09:14+00:00"
},
{
"job_num": 2,
"run_id": "b8f3b0bb-f155-4a5c-acce-f3504c0e09aa",
"status": "completed",
"batch_id": "batch_69fba02df7b481909e96afa1ee8879f5",
"records_submitted": 2274,
"records_completed": 2274,
"records_failed": 0,
"submitted_at": "2026-05-06T20:10:21.424330+00:00",
"completed_at": "2026-05-06T20:37:11+00:00"
},
{
"job_num": 3,
"run_id": "8d769f37-6beb-4a1b-87ee-3f66cdc6adc8",
"status": "completed",
"batch_id": "batch_69fba69a85488190977792b6f95b614b",
"records_submitted": 2282,
"records_completed": 2282,
"records_failed": 0,
"submitted_at": "2026-05-06T20:37:45.586815+00:00",
"completed_at": "2026-05-06T21:09:24+00:00"
},
{
"job_num": 4,
"run_id": "e6affbc2-ddc9-43a6-b8e9-d1f47e736283",
"status": "completed",
"batch_id": "batch_69fbe44565748190ad19f17ee3143f8d",
"records_submitted": 2257,
"records_completed": 2257,
"records_failed": 0,
"submitted_at": "2026-05-07T01:00:52.886953+00:00",
"completed_at": "2026-05-07T09:20:01+00:00"
}
]
}

View File

@@ -1,6 +1,6 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
analysis_batch.py OpenAI Batch API job runner openai_batch.py OpenAI Batch API job runner
Run tokenizer.py first to generate report.json, then: Run tokenizer.py first to generate report.json, then:
create <report.json> --model <model> build job directory create <report.json> --model <model> build job directory
@@ -8,7 +8,7 @@ Run tokenizer.py first to generate report.json, then:
status [--job N] [--dir DIR] check job status status [--job N] [--dir DIR] check job status
download [--job N] [--dir DIR] download + normalize completed jobs download [--job N] [--dir DIR] download + normalize completed jobs
DIR is a name under analysis/gpt4o/jobs/ (default: most recently created). DIR is a name under analysis/jobs/ (default: most recently created).
""" """
import argparse import argparse
@@ -52,17 +52,24 @@ _MODEL_ENCODING: dict[str, str] = {
"gpt-4o-mini": "o200k_base", "gpt-4o-mini": "o200k_base",
"gpt-o4-mini": "o200k_base", "gpt-o4-mini": "o200k_base",
} }
_LIMIT_BUFFER = 0.90 _LIMIT_BUFFER = 0.80
def estimate_tokens(messages: list[dict], model: str) -> int: def estimate_tokens(messages: list[dict], model: str) -> int:
"""Exact token count via tiktoken; falls back to chars/3 + 4 overhead per message.""" """Token count per OpenAI cookbook chat formula; falls back to chars/3."""
try: try:
import tiktoken import tiktoken
enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base")) enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base"))
return sum(4 + len(enc.encode(m["content"])) for m in messages) # Per OpenAI cookbook for gpt-4o: 3 overhead per message + role + content;
# plus 3 tokens for the reply primer (<|start|>assistant<|message|>).
total = 3 # reply primer
for m in messages:
total += 3
total += len(enc.encode(m.get("role", "")))
total += len(enc.encode(m["content"]))
return total
except ImportError: except ImportError:
return sum(4 + len(m["content"]) // 3 for m in messages) return 3 + sum(3 + len(m["content"]) // 3 for m in messages)
def chunk_comments_by_tokens( def chunk_comments_by_tokens(
@@ -91,7 +98,7 @@ def chunk_comments_by_tokens(
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Prompt # Prompt
_DEFAULT_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt" _DEFAULT_PROMPT_FILE = Path(__file__).parent / "prompt-1.txt"
SYSTEM_PROMPT = _DEFAULT_PROMPT_FILE.read_text(encoding="utf-8").strip() SYSTEM_PROMPT = _DEFAULT_PROMPT_FILE.read_text(encoding="utf-8").strip()
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7] PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
@@ -375,7 +382,7 @@ def cmd_create(args) -> None:
print(f"Created: {job_dir.name}") print(f"Created: {job_dir.name}")
print(f" {len(chunks)} job(s) | {len(comments)} comments | model: {args.model}") print(f" {len(chunks)} job(s) | {len(comments)} comments | model: {args.model}")
print(f"\nNext: python analysis/gpt4o/analysis_batch.py submit") print(f"\nNext: python analysis/openai_batch.py submit")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -431,7 +438,7 @@ def cmd_submit(args, client) -> None:
save_status(status, job_dir) save_status(status, job_dir)
print(f"Job {n} submitted: {batch.id} ({batch.status})") print(f"Job {n} submitted: {batch.id} ({batch.status})")
print(f" python analysis/gpt4o/analysis_batch.py status") print(f" python analysis/openai_batch.py status")
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@@ -1,12 +1,12 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
""" """
analysis/gpt4o/analysis-realtime.py Synchronous GPT-4o pipeline for VA Townhall comments. analysis/openai_realtime.py Synchronous GPT-4o pipeline for VA Townhall comments.
Usage: Usage:
python analysis/gpt4o/analysis-realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL] python analysis/openai_realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
Output: Output:
analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl analysis/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
""" """
import argparse import argparse
@@ -30,7 +30,7 @@ except ImportError:
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Prompt — loaded from analysis/prompt-1.txt at import time # Prompt — loaded from analysis/prompt-1.txt at import time
_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt" _PROMPT_FILE = Path(__file__).parent / "prompt-1.txt"
SYSTEM_PROMPT = _PROMPT_FILE.read_text(encoding="utf-8").strip() SYSTEM_PROMPT = _PROMPT_FILE.read_text(encoding="utf-8").strip()
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7] PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]

View File

@@ -3,10 +3,11 @@
tokenizer.py estimate token usage and cost for a batch analysis run. tokenizer.py estimate token usage and cost for a batch analysis run.
Usage: Usage:
python analysis/gpt4o/tokenizer.py output/f452.jsonl [--prompt analysis/prompt-1.txt] python analysis/tokenizer.py output/f452.jsonl [--prompt analysis/prompt-1.txt]
python analysis/tokenizer.py analysis/jobs/f452-1/job1-input.jsonl # count actual tokens in a job
Prints a per-model comparison table and writes report.json next to the input file. Prints a per-model comparison table and writes reports/<stem>-report.json.
Run this before analysis_batch.py create. Run this before openai_batch.py create.
""" """
import argparse import argparse
@@ -17,7 +18,7 @@ import sys
from pathlib import Path from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
import analysis_batch as _ab import openai_batch as _ab
# Input pricing ($/1M tokens, batch API) — from docs/openai.md, updated 2026-05-05. # Input pricing ($/1M tokens, batch API) — from docs/openai.md, updated 2026-05-05.
# Add Anthropic/other models here when needed; only models with a LIMITS entry are reported. # Add Anthropic/other models here when needed; only models with a LIMITS entry are reported.
@@ -66,6 +67,32 @@ def compute_report(
return report return report
def count_input_tokens(path: Path, model: str = "gpt-4o") -> dict:
    """Tally estimated tokens for every request in a job input JSONL file.

    The file is expected to be in batch request format: each non-blank line
    is a JSON object carrying its chat messages under ``body.messages``
    (per the original note, as written by build_batch_request_line).
    Blank lines are skipped.

    Args:
        path: Job input JSONL file to read (UTF-8).
        model: Model name forwarded to ``_ab.estimate_tokens``.

    Returns:
        A dict with keys ``total_tokens``, ``total_requests``, ``min``,
        ``max`` and ``mean`` (mean rounded to one decimal place). All
        values are zero when the file contains no requests.
    """
    with open(path, encoding="utf-8") as handle:
        # One estimate per non-blank line, in file order.
        per_request = [
            _ab.estimate_tokens(json.loads(text)["body"]["messages"], model)
            for text in (raw.strip() for raw in handle)
            if text
        ]

    n_requests = len(per_request)
    if n_requests == 0:
        return {"total_tokens": 0, "total_requests": 0, "min": 0, "max": 0, "mean": 0.0}

    grand_total = sum(per_request)
    return {
        "total_tokens": grand_total,
        "total_requests": n_requests,
        "min": min(per_request),
        "max": max(per_request),
        "mean": round(grand_total / n_requests, 1),
    }
def print_table(report: dict) -> None: def print_table(report: dict) -> None:
"""Print a human-readable model comparison table to stdout.""" """Print a human-readable model comparison table to stdout."""
print(f"\nInput: {report['input_file']}") print(f"\nInput: {report['input_file']}")
@@ -90,11 +117,21 @@ def print_table(report: dict) -> None:
print() print()
def _is_job_input(path: Path) -> bool:
    """Return True if this JSONL looks like a batch request file.

    Only the first non-blank line is inspected. The file counts as a job
    input when that line parses to a JSON object containing a
    ``custom_id`` key.

    Args:
        path: JSONL file to probe (read as UTF-8).

    Returns:
        True when the first non-blank record is an object with a
        ``custom_id`` key; False otherwise, including for an empty or
        all-blank file.

    Raises:
        json.JSONDecodeError: If the first non-blank line is not valid JSON.
    """
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if line:
                record = json.loads(line)
                # Require a JSON object: a bare number would make `in`
                # raise TypeError, and a bare string would turn the key
                # test into a substring match.
                return isinstance(record, dict) and "custom_id" in record
    return False
def main() -> None: def main() -> None:
_default_prompt = Path(__file__).parent.parent / "prompt-1.txt" _default_prompt = Path(__file__).parent / "prompt-1.txt"
parser = argparse.ArgumentParser(description="Estimate batch token usage and cost.") parser = argparse.ArgumentParser(description="Estimate batch token usage and cost.")
parser.add_argument("input", help="Scraped JSONL file") parser.add_argument("input", help="Scraped JSONL or job input JSONL (jobN-input.jsonl)")
parser.add_argument( parser.add_argument(
"--prompt", "--prompt",
default=str(_default_prompt), default=str(_default_prompt),
@@ -106,6 +143,16 @@ def main() -> None:
if not input_path.exists(): if not input_path.exists():
sys.exit(f"File not found: {input_path}") sys.exit(f"File not found: {input_path}")
# --- Mode: count tokens in an existing job input file ---
if _is_job_input(input_path):
result = count_input_tokens(input_path)
print(f"\nJob input: {input_path.name}")
print(f" Requests : {result['total_requests']:,}")
print(f" Tokens : {result['total_tokens']:,}")
print(f" Per-req : min={result['min']} max={result['max']} mean={result['mean']}")
return
# --- Mode: estimate from raw scrape file and write report.json ---
prompt_path = Path(args.prompt) prompt_path = Path(args.prompt)
if not prompt_path.exists(): if not prompt_path.exists():
sys.exit(f"Prompt file not found: {prompt_path}") sys.exit(f"Prompt file not found: {prompt_path}")
@@ -131,10 +178,12 @@ def main() -> None:
print_table(report) print_table(report)
out_path = input_path.parent / "report.json" reports_dir = Path(__file__).parent.parent / "reports"
reports_dir.mkdir(exist_ok=True)
out_path = reports_dir / f"{input_path.stem}-report.json"
out_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8") out_path.write_text(json.dumps(report, indent=2, ensure_ascii=False), encoding="utf-8")
print(f"Report written to: {out_path}") print(f"Report written to: {out_path}")
print(f"\nNext: python analysis/gpt4o/analysis_batch.py create {out_path} --model <model>") print(f"\nNext: python analysis/openai_batch.py create {out_path} --model <model>")
if __name__ == "__main__": if __name__ == "__main__":

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 130 KiB

View File

@@ -1,9 +1,18 @@
<mxfile host="app.diagrams.net"> <mxfile host="app.diagrams.net">
<diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv"> <diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv">
<mxGraphModel dx="2179" dy="1118" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0"> <mxGraphModel dx="1315" dy="798" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<root> <root>
<mxCell id="0" /> <mxCell id="0" />
<mxCell id="1" parent="0" /> <mxCell id="1" parent="0" />
<mxCell id="mENAtx_syaeSO5uR6kG6-61" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1000" y="330" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-60" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1010" y="340" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-59" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1020" y="350" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29"> <mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29">
<mxGeometry relative="1" as="geometry"> <mxGeometry relative="1" as="geometry">
<mxPoint x="200" y="290" as="targetPoint" /> <mxPoint x="200" y="290" as="targetPoint" />
@@ -18,18 +27,18 @@
<mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1">
<mxGeometry height="60" width="120" x="400" y="170" as="geometry" /> <mxGeometry height="60" width="120" x="400" y="170" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=center;verticalAlign=top;rounded=0;" value="gather forum data" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div align=&quot;left&quot;&gt;- collect forum data&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="20" y="240" as="geometry" /> <mxGeometry height="60" width="120" x="40" y="240" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;tokenize forum,&lt;/div&gt;&lt;div&gt;generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;- tokenize forum&lt;/div&gt;&lt;div&gt;- generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="400" y="240" as="geometry" /> <mxGeometry height="60" width="120" x="400" y="240" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-35"> <mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-73">
<mxGeometry relative="1" as="geometry"> <mxGeometry relative="1" as="geometry">
<mxPoint x="910" y="270" as="targetPoint" /> <mxPoint x="953" y="240" as="targetPoint" />
</mxGeometry> </mxGeometry>
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="batch" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="openai_batch" vertex="1">
<mxGeometry height="60" width="120" x="720" y="170" as="geometry" /> <mxGeometry height="60" width="120" x="720" y="170" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;--model&lt;/div&gt;&lt;div&gt;--limit&lt;/div&gt;" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;--model&lt;/div&gt;&lt;div&gt;--limit&lt;/div&gt;" vertex="1">
@@ -38,11 +47,8 @@
<mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1">
<mxGeometry height="60" width="120" x="-90" y="170" as="geometry" /> <mxGeometry height="60" width="120" x="-90" y="170" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-25" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--prompt" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;- split job into batches&lt;/div&gt;&lt;div&gt;- submit first batch&lt;/div&gt;&lt;div&gt;- status of current batch&lt;/div&gt;&lt;div&gt;- download batch artifacts&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="270" y="210" as="geometry" /> <mxGeometry height="70" width="140" x="720" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;split job into batches&lt;/div&gt;&lt;div&gt;submit first batch&lt;/div&gt;&lt;div&gt;status of current batch&lt;/div&gt;&lt;div&gt;download batch artifacts&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="720" y="240" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="210" y="240" as="geometry" /> <mxGeometry height="70" width="50" x="210" y="240" as="geometry" />
@@ -58,7 +64,7 @@
</Array> </Array>
</mxGeometry> </mxGeometry>
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;forum&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;&amp;lt;forumid&amp;gt;&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="230" y="260" as="geometry" /> <mxGeometry height="70" width="50" x="230" y="260" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19"> <mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19">
@@ -69,30 +75,42 @@
</Array> </Array>
</mxGeometry> </mxGeometry>
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;&lt;br&gt;&lt;/div&gt;&lt;div&gt;&amp;lt;forumid&amp;gt;&lt;br&gt;-report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="560" y="240" as="geometry" /> <mxGeometry height="70" width="50" x="560" y="240" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="job.json" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;status&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="890" y="240" as="geometry" /> <mxGeometry height="70" width="50" x="913.25" y="360" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-41" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-&lt;/div&gt;&lt;div&gt;output&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="940" y="340" as="geometry" /> <mxGeometry height="70" width="50" x="1090" y="360" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-42" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="950" y="350" as="geometry" /> <mxGeometry height="70" width="50" x="1150" y="360" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;batchN-&lt;/div&gt;&lt;div&gt;output-&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-54" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-&lt;/div&gt;&lt;div&gt;input&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="960" y="360" as="geometry" /> <mxGeometry height="70" width="50" x="1030" y="360" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1"> <mxCell id="mENAtx_syaeSO5uR6kG6-64" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-63" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-5">
<mxGeometry height="70" width="50" x="980" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-51" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-41">
<mxGeometry relative="1" as="geometry" /> <mxGeometry relative="1" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-53" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-48"> <mxCell id="mENAtx_syaeSO5uR6kG6-63" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;prompt&lt;/div&gt;&lt;div&gt;.txt&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="270" y="90" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-67" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="create" vertex="1">
<mxGeometry height="20" width="120" x="850" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-71" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;submit&lt;/div&gt;&lt;div&gt;&lt;br&gt;&lt;/div&gt;&lt;div&gt;status&lt;/div&gt;&lt;div&gt;download&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="1020" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-75" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" target="mENAtx_syaeSO5uR6kG6-35">
<mxGeometry relative="1" as="geometry" /> <mxGeometry relative="1" as="geometry" />
</mxCell> </mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-76" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-61">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-73" parent="1" style="image;aspect=fixed;perimeter=ellipsePerimeter;html=1;align=center;shadow=0;dashed=0;spacingTop=3;image=img/lib/active_directory/folder.svg;" value="&amp;lt;forumid&amp;gt;-N" vertex="1">
<mxGeometry height="50" width="36.5" x="920" y="240" as="geometry" />
</mxCell>
</root> </root>
</mxGraphModel> </mxGraphModel>
</diagram> </diagram>

4
docs/pipeline-v1.2.3.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 170 KiB

View File

@@ -201,44 +201,48 @@ batch processing should be a resumable job queue, not a one-shot script. the us
- remaining-comment detection - remaining-comment detection
** notes ** notes
- analysis/gpt4o/tokenizer.py: new standalone script; imports analysis_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes report.json to input file's directory. MODEL_PRICING dict lives here (not in analysis_batch). - analysis/tokenizer.py: new standalone script; imports openai_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes reports/<stem>-report.json. MODEL_PRICING dict lives here (not in openai_batch). Pass a jobN-input.jsonl to count actual tokens instead.
- analysis/gpt4o/analysis_batch.py: fully rewritten with four subcommands: create, submit, status, download. No longer uses REQUESTS_DIR / RAW_DIR / RUNS_DIR. - analysis/openai_batch.py: fully rewritten with four subcommands: create, submit, status, download. Job dirs at analysis/jobs/<stem[:8]>-N/.
- Job directories: analysis/gpt4o/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl. - Job directories: analysis/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
- status.json: tracks all jobs with pending/submitted/in_progress/completed/failed states. Updated by submit, status, download. - status.json: tracks all jobs with pending/submitted/in_progress/completed/failed states. Updated by submit, status, download.
- _find_next_eligible_job: pure function for testability. Returns (next_pending_job, None) or (None, warning). Blocks submission if previous job is in_progress/submitted. - _find_next_eligible_job: pure function for testability. Returns (next_pending_job, None) or (None, warning). Blocks submission if previous job is in_progress/submitted.
- create: no API key required. Reads report.json, re-chunks comments, writes all jobN-input.jsonl files, writes status.json. - create: no API key required. Reads report.json, re-chunks comments, writes all jobN-input.jsonl files, writes status.json.
- submit: uploads jobN-input.jsonl to Files API, creates batch, updates status.json to 'submitted'. Will not stack batches. - submit: uploads jobN-input.jsonl to Files API, creates batch, updates status.json to 'submitted'. Will not stack batches.
- status: retrieves batch from OpenAI, updates status.json counts and status. - status: retrieves batch from OpenAI, updates status.json counts and status.
- download: auto-runs status first, downloads output_file_id → jobN-output-raw.jsonl, error_file_id → jobN-errors.jsonl, normalizes → jobN-output.jsonl. Updates status.json. - download: auto-runs status first, downloads output_file_id → jobN-output-raw.jsonl, error_file_id → jobN-errors.jsonl, normalizes → jobN-output.jsonl. Updates status.json.
- tests/test_tokenizer.py: 15 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, report.json round-trip. - tests/tokenizer.py: 19 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, count_input_tokens, report.json round-trip.
- Token limit buffer: _LIMIT_BUFFER=0.80 (20% headroom). Estimate uses OpenAI cookbook chat formula (role tokens + 3-token reply primer). Verify a job file with: python analysis/tokenizer.py analysis/jobs/<dir>/jobN-input.jsonl
*** usage *** usage
#+begin_src sh #+begin_src powershell
# 1. estimate tokens and cost # 1. estimate tokens and cost
python analysis/gpt4o/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt python analysis/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
# writes output/report.json # writes reports/f452-report.json
# 2. create job directory (no api key needed) # 2. verify actual tokens in a job file (optional sanity check)
python analysis/gpt4o/analysis_batch.py create output/report.json --model gpt-4o-mini python analysis/tokenizer.py analysis/jobs/f452-1/job1-input.jsonl
# creates analysis/gpt4o/jobs/f452-1/
# 3. submit first job # 3. create job directory (no api key needed)
python analysis/gpt4o/analysis_batch.py submit python analysis/openai_batch.py create reports/f452-report.json --model gpt-5.4-mini
# creates analysis/jobs/f452-1/
# 4. check status (repeat until completed) # 4. submit first job
python analysis/gpt4o/analysis_batch.py status python analysis/openai_batch.py submit
# 5. download and normalize # 5. check status (repeat until completed)
python analysis/gpt4o/analysis_batch.py download python analysis/openai_batch.py status
# 6. submit next job (if multi-job run), then repeat 4-5 # 6. download and normalize
python analysis/gpt4o/analysis_batch.py submit python analysis/openai_batch.py download
# 7. submit next job (if multi-job run), then repeat 5-6
python analysis/openai_batch.py submit
#+end_src #+end_src
** evidence ** evidence
- commit: - commit:
- tests: passing (pytest tests/analysis_gpt4o_batch.py tests/test_tokenizer.py) - tests: passing (pytest tests/openai_batch.py tests/openai_realtime.py tests/tokenizer.py)
- datetime: [2026-05-05 Tue] - datetime: [2026-05-06 Wed]
* === Backlog === * === Backlog ===
* [ ] X: analysis validation view * [ ] X: analysis validation view

43
reports/f452-1.json Normal file
View File

@@ -0,0 +1,43 @@
{
"prompt": "analysis\\prompt-1.txt",
"prompt_hash": "cb41250",
"input_file": "output\\f452.jsonl",
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
"total_comments": 9083,
"input_tokens": 6397254,
"gpt-5.5": {
"jobs": 9,
"cost_$": 15.9931,
"est_queue_days": 7.11
},
"gpt-5.4": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-5.4-mini": {
"jobs": 4,
"cost_$": 2.399,
"est_queue_days": 3.2
},
"gpt-5.4-nano": {
"jobs": 40,
"cost_$": 0.6397,
"est_queue_days": 31.99
},
"gpt-4o": {
"jobs": 9,
"cost_$": 7.9966,
"est_queue_days": 7.11
},
"gpt-4o-mini": {
"jobs": 4,
"cost_$": 0.4798,
"est_queue_days": 3.2
},
"gpt-o4-mini": {
"jobs": 4,
"cost_$": 3.5185,
"est_queue_days": 3.2
}
}

View File

@@ -1,4 +1,4 @@
"""Unit tests for analysis/gpt4o/analysis_batch.py — no real API calls.""" """Unit tests for analysis/openai_batch.py — no real API calls."""
import json import json
import sys import sys
@@ -7,8 +7,8 @@ from unittest.mock import MagicMock
import pytest import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o")) sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
import analysis_batch as bt import openai_batch as bt
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -101,7 +101,7 @@ def test_prompt_version_is_7_hex_chars():
def test_prompt_version_matches_realtime(): def test_prompt_version_matches_realtime():
"""Both scripts must derive the same PROMPT_VERSION from the same file.""" """Both scripts must derive the same PROMPT_VERSION from the same file."""
import analysis_realtime as rt import openai_realtime as rt
assert bt.PROMPT_VERSION == rt.PROMPT_VERSION assert bt.PROMPT_VERSION == rt.PROMPT_VERSION
@@ -242,7 +242,8 @@ def test_estimate_tokens_fallback_without_tiktoken(monkeypatch):
monkeypatch.setitem(_sys.modules, "tiktoken", None) monkeypatch.setitem(_sys.modules, "tiktoken", None)
messages = [{"role": "user", "content": "x" * 300}] messages = [{"role": "user", "content": "x" * 300}]
result = bt.estimate_tokens(messages, "gpt-4o") result = bt.estimate_tokens(messages, "gpt-4o")
assert result == 4 + 300 // 3 # fallback: 3 primer + (3 + 300//3) per message
assert result == 3 + (3 + 300 // 3)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@@ -1,4 +1,4 @@
"""Unit tests for analysis/gpt4o/analysis_realtime.py — no real API calls.""" """Unit tests for analysis/openai_realtime.py — no real API calls."""
import json import json
import sys import sys
@@ -7,8 +7,8 @@ from unittest.mock import MagicMock
import pytest import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o")) sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
import analysis_realtime as rt import openai_realtime as rt
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------

View File

@@ -1,4 +1,4 @@
"""Unit tests for analysis/gpt4o/tokenizer.py — no real API calls.""" """Unit tests for analysis/tokenizer.py — no real API calls."""
import io import io
import json import json
@@ -9,9 +9,9 @@ from unittest.mock import patch
import pytest import pytest
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o")) sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
import tokenizer as tk import tokenizer as tk
import analysis_batch as ab import openai_batch as ab
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@@ -199,3 +199,52 @@ def test_report_json_roundtrip(tmp_path):
assert loaded["total_comments"] == report["total_comments"] assert loaded["total_comments"] == report["total_comments"]
assert loaded["input_tokens"] == report["input_tokens"] assert loaded["input_tokens"] == report["input_tokens"]
assert loaded["gpt-4o-mini"]["jobs"] == report["gpt-4o-mini"]["jobs"] assert loaded["gpt-4o-mini"]["jobs"] == report["gpt-4o-mini"]["jobs"]
# ---------------------------------------------------------------------------
# count_input_tokens
def _make_job_input(tmp_path, comments, forum=None) -> Path:
    """Write *comments* as a batch-request JSONL file and return its path.

    Each comment becomes one JSON line built by ``ab.build_batch_request_line``
    with the model fixed to "gpt-4o-mini". The file is always named
    ``job1-input.jsonl`` and is created inside *tmp_path*.
    """
    job_file = tmp_path / "job1-input.jsonl"
    with open(job_file, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(ab.build_batch_request_line(comment, forum, "gpt-4o-mini")) + "\n"
            for comment in comments
        )
    return job_file
def test_count_input_tokens_matches_estimate(tmp_path):
    """count_input_tokens on a fresh job file agrees with summing estimate_tokens per comment."""
    job_file = _make_job_input(tmp_path, COMMENTS, FORUM_ITEM)
    counted = tk.count_input_tokens(job_file, "gpt-4o-mini")

    # Rebuild the expected total straight from COMMENTS rather than from the file.
    expected_total = 0
    for comment in COMMENTS:
        messages = ab.build_messages(comment, FORUM_ITEM)[0]
        expected_total += ab.estimate_tokens(messages, "gpt-4o-mini")

    assert counted["total_tokens"] == expected_total
    assert counted["total_requests"] == len(COMMENTS)
def test_count_input_tokens_fields(tmp_path):
    """The result exposes exactly the five summary keys, with min <= mean <= max and min > 0."""
    expected_keys = {"total_tokens", "total_requests", "min", "max", "mean"}
    job_file = _make_job_input(tmp_path, COMMENTS, FORUM_ITEM)
    stats = tk.count_input_tokens(job_file)

    assert set(stats.keys()) == expected_keys

    lowest, average, highest = stats["min"], stats["mean"], stats["max"]
    assert lowest <= average
    assert average <= highest
    assert lowest > 0
def test_count_input_tokens_empty_file(tmp_path):
    """An empty JSONL file yields zero tokens and zero requests."""
    empty_file = tmp_path / "empty.jsonl"
    empty_file.write_text("", encoding="utf-8")

    stats = tk.count_input_tokens(empty_file)

    assert stats["total_tokens"] == 0
    assert stats["total_requests"] == 0
def test_count_input_tokens_includes_system_prompt(tmp_path):
    """Token count must exceed the user text length // 3, since the prompt adds tokens."""
    job_file = _make_job_input(tmp_path, [COMMENT_A], FORUM_ITEM)
    stats = tk.count_input_tokens(job_file)

    # Naive lower bound: the comment text alone at roughly 3 characters per token.
    # The system prompt is expected to push the real total above it.
    naive_user_estimate = len(COMMENT_A.get("text", "")) // 3
    assert stats["total_tokens"] > naive_user_estimate