refactor/batch-openai prep
This commit is contained in:
@@ -1,11 +1,14 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
analysis/gpt4o/analysis-batch.py — OpenAI Batch API pipeline
|
||||
analysis_batch.py — OpenAI Batch API pipeline
|
||||
|
||||
Commands (run manually in order):
|
||||
submit <input_jsonl> [--model gpt-4o] — build request file, upload, create batch
|
||||
status <run_id> — check batch status, update manifest
|
||||
download <run_id> — download + normalize output, update manifest
|
||||
submit <input_jsonl> [--model gpt-4o] [--limit N]
|
||||
— build request file, upload, create batch
|
||||
status [run_id] — check batch status, update manifest
|
||||
download [run_id] — download + normalize output, update manifest
|
||||
|
||||
run_id defaults to the most recent run in runs/ when omitted.
|
||||
|
||||
File layout (all under analysis/gpt4o/):
|
||||
requests/<run_id>.jsonl — batch input sent to OpenAI
|
||||
@@ -29,6 +32,79 @@ try:
|
||||
except ImportError:
|
||||
sys.exit("openai package not installed. Run: pip install openai")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Model limits and token estimation
|
||||
|
||||
# Max enqueued tokens across ALL concurrent batches for this model
# (docs/openai.md pricing table, updated 2026-05-05).
# NOTE: your org tier may be lower — if a submit fails, use --limit to reduce chunk size.
# Looked up by the model name given to `submit --model`; values are token counts.
MODEL_LIMITS: dict[str, int] = {
    "gpt-5.5": 900_000,
    "gpt-5.4": 900_000,
    "gpt-5.4-mini": 2_000_000,
    "gpt-5.4-nano": 200_000,
    "gpt-4o": 900_000,
    "gpt-4o-mini": 2_000_000,
    # NOTE(review): confirm this id against docs/openai.md — the model may be
    # published as "o4-mini" (no "gpt-" prefix), in which case this key never matches.
    "gpt-o4-mini": 2_000_000,
}
# Limit used for any model name that is not a key of MODEL_LIMITS.
_DEFAULT_TOKEN_LIMIT = 900_000

# tiktoken encoding per model family; unknown models fall back to o200k_base
# (every entry currently maps to o200k_base, so the table is a placeholder for
# future model families that need a different encoding).
_MODEL_ENCODING: dict[str, str] = {
    "gpt-5.5": "o200k_base",
    "gpt-5.4": "o200k_base",
    "gpt-5.4-mini": "o200k_base",
    "gpt-5.4-nano": "o200k_base",
    "gpt-4o": "o200k_base",
    "gpt-4o-mini": "o200k_base",
    "gpt-o4-mini": "o200k_base",
}
# Leave 10% headroom below the published limit
# (the raw limit is multiplied by this factor before chunking).
_LIMIT_BUFFER = 0.90
|
||||
|
||||
|
||||
def estimate_tokens(messages: list[dict], model: str) -> int:
    """Estimate the token count of a chat ``messages`` list.

    Every message is charged a flat 4-token overhead plus the size of its
    ``content``.  When tiktoken is importable the content is tokenized with
    the encoding registered for ``model`` in ``_MODEL_ENCODING`` (falling back
    to ``o200k_base`` for unknown models); otherwise the size is approximated
    as ``len(content) // 3``.

    The result is an estimate: the per-message overhead is a fixed
    approximation, so it should only be used for sizing batches, not billing.

    Args:
        messages: Chat messages; each is a dict whose ``content`` is a string.
            A missing or ``None`` content counts as empty text.
        model: Model name used to select the tiktoken encoding.

    Returns:
        Estimated number of tokens for the whole list (0 for an empty list).
    """
    try:
        import tiktoken
    except ImportError:
        # No tokenizer available: fall back to the chars/3 heuristic.
        return sum(4 + len(m.get("content") or "") // 3 for m in messages)

    enc = tiktoken.get_encoding(_MODEL_ENCODING.get(model, "o200k_base"))
    # Comments are scraped, untrusted text.  By default tiktoken raises
    # ValueError when the input contains a special-token literal such as
    # "<|endoftext|>"; disallowed_special=() encodes it as ordinary text instead.
    return sum(
        4 + len(enc.encode(m.get("content") or "", disallowed_special=()))
        for m in messages
    )
|
||||
|
||||
|
||||
def chunk_comments_by_tokens(
    comments: list[dict], forum: dict | None, model: str
) -> list[list[dict]]:
    """Greedily group ``comments`` into consecutive chunks sized by token budget.

    The budget is the model's entry in ``MODEL_LIMITS`` (or
    ``_DEFAULT_TOKEN_LIMIT``) scaled by ``_LIMIT_BUFFER``.  Comments are kept
    in input order; a new chunk is started whenever adding the next comment
    would push the running estimate past the budget.  A comment whose own
    estimate exceeds the budget is still emitted, alone in its chunk.
    """
    budget = int(MODEL_LIMITS.get(model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)

    result: list[list[dict]] = []
    pending: list[dict] = []
    pending_tokens = 0

    for item in comments:
        item_messages, _ = build_messages(item, forum)
        cost = estimate_tokens(item_messages, model)
        # Close the open chunk first if this comment would overflow it;
        # never close an empty chunk, so oversize comments are not lost.
        if pending and pending_tokens + cost > budget:
            result.append(pending)
            pending = []
            pending_tokens = 0
        pending.append(item)
        pending_tokens += cost

    if pending:
        result.append(pending)

    return result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt
|
||||
|
||||
@@ -221,6 +297,15 @@ def make_manifest(
|
||||
}
|
||||
|
||||
|
||||
def _latest_run_id() -> str:
    """Return the stem of the newest ``*.json`` manifest in ``RUNS_DIR``.

    "Newest" means the largest file modification time.  Exits the process
    with a message when the directory is missing or holds no manifests.
    """
    if RUNS_DIR.exists():
        manifests = list(RUNS_DIR.glob("*.json"))
    else:
        manifests = []

    if manifests:
        newest = max(manifests, key=lambda candidate: candidate.stat().st_mtime)
        return newest.stem

    sys.exit(f"No runs found in {RUNS_DIR}. Submit a batch first.")
|
||||
|
||||
|
||||
def load_manifest(run_id: str) -> dict:
    """Read ``RUNS_DIR/<run_id>.json`` as UTF-8 and return the parsed JSON."""
    manifest_file = RUNS_DIR / f"{run_id}.json"
    raw_text = manifest_file.read_text(encoding="utf-8")
    return json.loads(raw_text)
|
||||
@@ -235,6 +320,55 @@ def save_manifest(manifest: dict) -> None:
|
||||
# ---------------------------------------------------------------------------
|
||||
# Subcommand: submit
|
||||
|
||||
def _submit_chunk(
    chunk: list[dict],
    forum: dict | None,
    input_path: Path,
    input_sha256: str,
    model: str,
    client,
    chunk_index: int,
    total_chunks: int,
) -> str:
    """Write, upload and submit one chunk of comments as an OpenAI batch.

    Steps: generate a fresh UUID run_id, write one request line per comment
    to ``REQUESTS_DIR/<run_id>.jsonl``, upload that file with purpose
    ``batch``, create a 24h chat-completions batch from it, then save a
    manifest describing the run.  Progress is logged to stderr.

    Returns:
        The run_id generated for this chunk.
    """
    import uuid

    run_id = str(uuid.uuid4())
    if total_chunks > 1:
        label = f"chunk {chunk_index + 1}/{total_chunks}"
    else:
        label = "single batch"

    # 1. Build the request file on disk.
    REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
    request_path = REQUESTS_DIR / f"{run_id}.jsonl"
    with open(request_path, "w", encoding="utf-8") as out:
        for comment in chunk:
            request = build_batch_request_line(comment, forum, model)
            out.write(json.dumps(request, ensure_ascii=False) + "\n")

    print(f"[{label}] Wrote {len(chunk)} requests → {request_path}", file=sys.stderr)

    # 2. Upload it.
    with open(request_path, "rb") as payload:
        uploaded = client.files.create(file=payload, purpose="batch")
    print(f"[{label}] Uploaded: {uploaded.id}", file=sys.stderr)

    # 3. Create the batch that consumes the uploaded file.
    source_name = str(input_path)
    batch = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"run_id": run_id, "input_filename": source_name},
    )
    print(f"[{label}] Batch created: {batch.id} status={batch.status}", file=sys.stderr)

    # 4. Record the run locally so status/download can find it later.
    save_manifest(
        make_manifest(
            run_id=run_id,
            input_filename=source_name,
            input_sha256=input_sha256,
            model=model,
            batch_id=batch.id,
            records_submitted=len(chunk),
            request_filename=str(request_path),
        )
    )
    return run_id
|
||||
|
||||
|
||||
def cmd_submit(args, client) -> None:
|
||||
_load_prompt(Path(args.prompt))
|
||||
print(f"Prompt: {args.prompt} (version {PROMPT_VERSION})", file=sys.stderr)
|
||||
@@ -250,49 +384,39 @@ def cmd_submit(args, client) -> None:
|
||||
if forum is None:
|
||||
print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)
|
||||
|
||||
import uuid
|
||||
run_id = str(uuid.uuid4())
|
||||
if args.limit:
|
||||
comments = comments[:args.limit]
|
||||
print(f"Limiting to {len(comments)} comments (--limit {args.limit}).", file=sys.stderr)
|
||||
|
||||
token_limit = int(MODEL_LIMITS.get(args.model, _DEFAULT_TOKEN_LIMIT) * _LIMIT_BUFFER)
|
||||
chunks = chunk_comments_by_tokens(comments, forum, args.model)
|
||||
total = len(chunks)
|
||||
print(
|
||||
f"Model: {args.model} token limit: {token_limit:,} "
|
||||
f"→ {len(comments)} comments split into {total} chunk(s).",
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()
|
||||
|
||||
# Build batch request file
|
||||
REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
request_path = REQUESTS_DIR / f"{run_id}.jsonl"
|
||||
with open(request_path, "w", encoding="utf-8") as f:
|
||||
for comment in comments:
|
||||
line = build_batch_request_line(comment, forum, args.model)
|
||||
f.write(json.dumps(line, ensure_ascii=False) + "\n")
|
||||
# Submit only the first chunk — the enqueued token limit is a TOTAL across all
|
||||
# concurrent batches, so stacking multiple submissions will exceed the quota.
|
||||
# Wait for each batch to complete before submitting the next.
|
||||
run_id = _submit_chunk(chunks[0], forum, input_path, input_sha256, args.model, client, 0, total)
|
||||
|
||||
print(f"Wrote {len(comments)} requests → {request_path}", file=sys.stderr)
|
||||
print(f"\nBatch 1/{total} submitted.", file=sys.stderr)
|
||||
print(f" status: python analysis/gpt4o/analysis_batch.py status {run_id}", file=sys.stderr)
|
||||
print(f" download: python analysis/gpt4o/analysis_batch.py download {run_id}", file=sys.stderr)
|
||||
|
||||
# Upload to OpenAI
|
||||
print("Uploading request file ...", file=sys.stderr)
|
||||
with open(request_path, "rb") as f:
|
||||
uploaded = client.files.create(file=f, purpose="batch")
|
||||
print(f"Uploaded: {uploaded.id}", file=sys.stderr)
|
||||
if total > 1:
|
||||
remaining = sum(len(c) for c in chunks[1:])
|
||||
print(f"\n{total - 1} more chunk(s) remaining ({remaining} comments).", file=sys.stderr)
|
||||
print("After this batch completes and is downloaded, rerun submit with --limit to get the next chunk:", file=sys.stderr)
|
||||
offset = len(chunks[0])
|
||||
for idx, chunk in enumerate(chunks[1:], start=2):
|
||||
print(f" chunk {idx}/{total}: comments {offset}–{offset + len(chunk) - 1}", file=sys.stderr)
|
||||
offset += len(chunk)
|
||||
|
||||
# Create batch
|
||||
batch = client.batches.create(
|
||||
input_file_id=uploaded.id,
|
||||
endpoint="/v1/chat/completions",
|
||||
completion_window="24h",
|
||||
metadata={"run_id": run_id, "input_filename": str(input_path)},
|
||||
)
|
||||
print(f"Batch created: {batch.id} status={batch.status}", file=sys.stderr)
|
||||
|
||||
# Save manifest
|
||||
manifest = make_manifest(
|
||||
run_id=run_id,
|
||||
input_filename=str(input_path),
|
||||
input_sha256=input_sha256,
|
||||
model=args.model,
|
||||
batch_id=batch.id,
|
||||
records_submitted=len(comments),
|
||||
request_filename=str(request_path),
|
||||
)
|
||||
save_manifest(manifest)
|
||||
|
||||
print(f"\nrun_id: {run_id}", file=sys.stderr)
|
||||
print(f"Check status: python analysis/gpt4o/analysis-batch.py status {run_id}", file=sys.stderr)
|
||||
print(run_id) # stdout for scripting
|
||||
|
||||
|
||||
@@ -300,7 +424,10 @@ def cmd_submit(args, client) -> None:
|
||||
# Subcommand: status
|
||||
|
||||
def cmd_status(args, client) -> None:
|
||||
manifest = load_manifest(args.run_id)
|
||||
run_id = args.run_id or _latest_run_id()
|
||||
if not args.run_id:
|
||||
print(f"(using latest run: {run_id})", file=sys.stderr)
|
||||
manifest = load_manifest(run_id)
|
||||
batch = client.batches.retrieve(manifest["batch_id"])
|
||||
|
||||
counts = batch.request_counts
|
||||
@@ -314,14 +441,17 @@ def cmd_status(args, client) -> None:
|
||||
|
||||
if batch.status == "completed":
|
||||
print(f"\nReady to download. Run:")
|
||||
print(f" python analysis/gpt4o/analysis-batch.py download {args.run_id}")
|
||||
print(f" python analysis/gpt4o/analysis_batch.py download {run_id}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Subcommand: download
|
||||
|
||||
def cmd_download(args, client) -> None:
|
||||
manifest = load_manifest(args.run_id)
|
||||
run_id = args.run_id or _latest_run_id()
|
||||
if not args.run_id:
|
||||
print(f"(using latest run: {run_id})", file=sys.stderr)
|
||||
manifest = load_manifest(run_id)
|
||||
batch = client.batches.retrieve(manifest["batch_id"])
|
||||
|
||||
if batch.status != "completed":
|
||||
@@ -398,12 +528,18 @@ def main() -> None:
|
||||
default=str(_DEFAULT_PROMPT_FILE),
|
||||
help="Path to system prompt file (default: analysis/prompt-1.txt)",
|
||||
)
|
||||
p_submit.add_argument(
|
||||
"--limit", type=int, default=None, metavar="N",
|
||||
help="Submit only the first N comments (useful for staying under token quota)",
|
||||
)
|
||||
|
||||
p_status = sub.add_parser("status", help="Check batch status")
|
||||
p_status.add_argument("run_id", help="run_id from submit output")
|
||||
p_status.add_argument("run_id", nargs="?", default=None,
|
||||
help="run_id from submit (default: most recent run)")
|
||||
|
||||
p_download = sub.add_parser("download", help="Download and normalize completed batch")
|
||||
p_download.add_argument("run_id", help="run_id from submit output")
|
||||
p_download.add_argument("run_id", nargs="?", default=None,
|
||||
help="run_id from submit (default: most recent run)")
|
||||
|
||||
args = parser.parse_args()
|
||||
client = openai.OpenAI(api_key=api_key)
|
||||
|
||||
@@ -1,16 +0,0 @@
|
||||
{
|
||||
"run_id": "5b8714a7-0666-40a2-9d69-2d9ce9074406",
|
||||
"input_filename": "output\\f452.jsonl",
|
||||
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
|
||||
"prompt_hash": "cb41250",
|
||||
"model": "gpt-4o",
|
||||
"batch_id": "batch_69fa579c7cd081909c049715838df6c6",
|
||||
"records_submitted": 9083,
|
||||
"records_completed": 0,
|
||||
"records_failed": 0,
|
||||
"request_filename": "C:\\Users\\moses\\projects\\vath\\analysis\\gpt4o\\requests\\5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl",
|
||||
"raw_output_filename": null,
|
||||
"normalized_output_filename": null,
|
||||
"created_at": "2026-05-05T20:48:28.268022+00:00",
|
||||
"completed_at": null
|
||||
}
|
||||
4
docs/pipeline-1.2.3.svg
Normal file
4
docs/pipeline-1.2.3.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 130 KiB |
99
docs/pipeline-v1.2.3.drawio
Normal file
99
docs/pipeline-v1.2.3.drawio
Normal file
@@ -0,0 +1,99 @@
|
||||
<mxfile host="app.diagrams.net">
|
||||
<diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv">
|
||||
<mxGraphModel dx="2179" dy="1118" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
|
||||
<root>
|
||||
<mxCell id="0" />
|
||||
<mxCell id="1" parent="0" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="200" y="290" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-1" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="scraper" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="40" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-46" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-5" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;" target="mENAtx_syaeSO5uR6kG6-34">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="400" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=center;verticalAlign=top;rounded=0;" value="gather forum data" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="20" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div>tokenize forum,</div><div>generate report w/</div><div>recommendations</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="400" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-35">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="910" y="270" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="batch" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="720" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="<div>--model</div><div>--limit</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="590" y="210" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="-90" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-25" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--prompt" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="270" y="210" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div>split job into batches</div><div>submit first batch</div><div>status of current batch</div><div>download batch artifacts</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="720" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="210" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-30" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="220" y="250" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-45" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-31" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-5">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<Array as="points">
|
||||
<mxPoint x="320" y="304" />
|
||||
<mxPoint x="320" y="200" />
|
||||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>forum</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="230" y="260" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<Array as="points">
|
||||
<mxPoint x="640" y="284" />
|
||||
<mxPoint x="640" y="200" />
|
||||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>report</div><div>.json</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="560" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="job.json" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="890" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-41" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="940" y="340" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-42" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="950" y="350" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>batchN-</div><div>output-</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="960" y="360" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>errors</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="980" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-51" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-41">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-53" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-48">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
</mxfile>
|
||||
123
docs/tasks.org
123
docs/tasks.org
@@ -104,7 +104,7 @@ Reference: ./docs/openai-batch.md. openai batch output order is not guaranteed,
|
||||
- tests: 18 passing (pytest tests/analysis_gpt4o_batch.py), 46 total across suite
|
||||
- datetime: [2026-05-05 Tue 17:00]
|
||||
|
||||
* [ ] t1.2.2: Tokenizer / Batch mgmt
|
||||
* [X] t1.2.2: Tokenizer / Batch mgmt
|
||||
openai batch analysis requires coordination - more like a job queue.
|
||||
batch script should set up the queue for the user to submit manually; the openai api will reject subsequent batches once the total daily token limit is reached.
|
||||
** Acceptance Criteria
|
||||
@@ -117,17 +117,136 @@ batch script should setup queue for user to setup manually; openai api will reje
|
||||
- Each chunk becomes its own batch submission with its own run_id.
|
||||
- Drop --limit (or keep as hard cap override).
|
||||
- Print all run_ids
|
||||
- Submit the first batch only
|
||||
- Submit the first batch only (failed)
|
||||
4. Update test script to show tokenizer output
|
||||
|
||||
** notes
|
||||
- MODEL_LIMITS and _MODEL_ENCODING dicts in analysis/gpt4o/analysis_batch.py; keyed by model name, sourced from docs/openai.md. Unknown models fall back to o200k_base encoding and 900k token limit.
|
||||
- estimate_tokens(messages, model): uses tiktoken (o200k_base) when available; falls back to chars/3 + 4 overhead per message.
|
||||
- chunk_comments_by_tokens(comments, forum, model): greedy bin-pack; respects 10% headroom (_LIMIT_BUFFER=0.90). Returns list of comment lists.
|
||||
- submit sends only chunks[0] — enqueued token limit is a TOTAL across all concurrent batches; stacking would exceed quota. Remaining chunk ranges are printed as manual instructions.
|
||||
- --limit N still available as a hard cap on total comments before chunking (useful when org-tier limit is below the published model limit).
|
||||
- pip install tiktoken required for exact token counting; chars/3 fallback activates automatically if not installed.
|
||||
|
||||
|
||||
*** usage
|
||||
- `pip install tiktoken`
|
||||
- submit first chunk (auto-sized to model token limit, uses most recent output file)
|
||||
`python analysis/gpt4o/analysis_batch.py submit output/f452.jsonl --model gpt-4o-mini`
|
||||
- check status (defaults to most recent run)
|
||||
`python analysis/gpt4o/analysis_batch.py status`
|
||||
- download + normalize when complete
|
||||
`python analysis/gpt4o/analysis_batch.py download`
|
||||
- submit next chunk: rerun with `--limit` to cover the next N comments
|
||||
(track which comment_ids have already been analyzed to avoid duplicates)
|
||||
|
||||
*** validation
|
||||
#+begin_src python
|
||||
import pandas as pd
|
||||
df_input = pd.read_json('C:/Users/moses/projects/vath/analysis/gpt4o/runs/75ee9a/f452.jsonl', lines=True)
|
||||
# drop forum item
|
||||
df_input_comments = df_input[df_input["comment_id"].notna()].copy()
|
||||
df_output = pd.read_json('C:/Users/moses/projects/vath/analysis/gpt4o/runs/75ee9a/75ee9a6c-8fc2-4924-8d96-b55bb4d5e832_gpt-4o.jsonl', lines=True)
|
||||
dfm = df_output.merge(df_input_comments,on="comment_id",how="left",suffixes=("","_input"),)
|
||||
dfm.to_csv('C:/Users/moses/projects/vath/analysis/gpt4o/1.csv')
|
||||
#+end_src
|
||||
order columns:
|
||||
forum_id_input,comment_id,title,text,date,author,stance,stance_confidence,stance_rationale,tone,tags,error,truncated,analyzed_at,prompt_version,model
|
||||
|
||||
** evidence
|
||||
- commit:
|
||||
- tests: 23 passing (pytest tests/analysis_gpt4o_batch.py), 51 total across suite
|
||||
- datetime: [2026-05-06 Wed 08:55]
|
||||
|
||||
* [ ] t1.2.3: batch job refactor
|
||||
This task encompasses intent and fixes for 1.2.1 and 1.2.2.
|
||||
batch processing should be a resumable job queue, not a one-shot script. the user should not need to remember offsets, completed chunks, failed batches, or which comments remain.
|
||||
** Acceptance Criteria
|
||||
1. create tokenizer to prepare the batch job
|
||||
- input: prompt.txt, forum.jsonl
|
||||
- output: report.json with each model's batch structure, cost, and time (considering tpd constraints)
|
||||
- analysis_batch should be able to take this report to run the job. good place to copy the raw scraper jsonl
|
||||
#+begin_src python
|
||||
{'prompt': 'prompt1.txt',
 'input_file': 'f451.jsonl',
 'input_tokens': 123456789,
 'gpt-4o': {'jobs':71,'cost_$':4,'est_queue_days':3},       # est_queue_days = tokens / model TPD
 'gpt-4o-mini': {'jobs':71,'cost_$':4,'est_queue_days':3}}  # est_queue_days = tokens / model TPD
|
||||
#+end_src
|
||||
2. batch py should contain commands to create, check, run, and complete jobs.
|
||||
- inputs: report.json, --model, optional --job N, read api key from .env
|
||||
- outputs:
|
||||
- status.json: job structure, status, metadata; updated when jobs are finished. includes all report.json info
|
||||
- for each job: jobN-input.jsonl (what is sent to openai); jobN-output-raw.jsonl, jobN-output.jsonl, and jobN-errors.jsonl (when downloaded)
|
||||
- jobN-output.jsonl contains:
|
||||
- one analysis record per comment
|
||||
- `run_id`, `forum_id`, `comment_id`, `analyzed_at`, `model`, `prompt_version`
|
||||
- `stance` toward proposed reg/guidance: support|oppose|neutral|unclear
|
||||
- `stance_confidence`: 0-1
|
||||
- short rationale, if provided by model
|
||||
- generic sentiment `tone` (separate from stance): positive|negative|neutral|mixed|unclear
|
||||
- `tags` for later grouping, may be empty
|
||||
- commands: `create`, `submit`, `status`, `download`
|
||||
- `create` run directory, copy input/prompt/report, generate status.json, job request files
|
||||
- `submit` if eligible, submit next or specified job; does not blindly stack jobs, warns if prev jobs in progress, print next action
|
||||
- `status` check status of one or all submitted jobs, update status.json
|
||||
- `download` fetches raw output (jobN-output-raw.jsonl) and error files for completed jobs, normalizes the raw output (jobN-output.jsonl), and runs `status` automatically.
|
||||
3. tests without live api calls
|
||||
- partial completed run
|
||||
- failed batch records
|
||||
- out-of-order output
|
||||
- duplicate custom_id
|
||||
- missing output file
|
||||
- resume from status.json
|
||||
- remaining-comment detection
|
||||
|
||||
* === Backlog ===
|
||||
* [ ] X: analysis validation view
|
||||
create a lightweight validation script that joins raw comments to normalized analysis output and writes a human-reviewable csv.
|
||||
|
||||
** acceptance criteria
|
||||
1. input raw scrape jsonl and all *-output.jsonl files in a dir
|
||||
2. join by comment_id, not dataframe index
|
||||
3. output csv columns in review order:
|
||||
- forum_id, comment_id, title, text, date, author
|
||||
- stance, stance_confidence, stance_rationale, tone, tags
|
||||
- error, truncated, analyzed_at, prompt_version, model
|
||||
4. print validation counts
|
||||
- raw comments
|
||||
- analyzed records
|
||||
- joined records
|
||||
- missing comment text
|
||||
- duplicate comment_ids
|
||||
- error records
|
||||
- stance counts
|
||||
- tone counts
|
||||
5. tests cover join behavior and missing/duplicate ids
|
||||
|
||||
** evidence
|
||||
- commit:
|
||||
- tests:
|
||||
- csv:
|
||||
- datetime:
|
||||
* [ ] X: text encoding cleanup
|
||||
fix mojibake in scraped text before analysis/reporting, especially curly quotes showing as ’.
|
||||
|
||||
** acceptance criteria
|
||||
1. identify whether mojibake exists in raw scrape, analysis output, or csv export only
|
||||
2. add repair step at the earliest correct layer
|
||||
3. preserve original raw scrape if repair changes source text
|
||||
4. add test cases for common bad sequences:
|
||||
- ’
|
||||
- “
|
||||
- â€
|
||||
- –
|
||||
- —
|
||||
5. document whether repaired text is used for model input
|
||||
|
||||
** evidence
|
||||
- commit:
|
||||
- tests:
|
||||
- before/after sample:
|
||||
- datetime:
|
||||
* [ ] X: complete proposal information
|
||||
Ensure we capture as much useful information as possible about the actual proposal (contact information, etc.), including what the state actually says about what was posted.
|
||||
** acceptance criteria
|
||||
|
||||
@@ -250,3 +250,62 @@ def test_manifest_save_load_roundtrip(tmp_path, monkeypatch):
|
||||
bt.save_manifest(m)
|
||||
loaded = bt.load_manifest(RUN_ID)
|
||||
assert loaded == m
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# estimate_tokens
|
||||
|
||||
def test_estimate_tokens_returns_positive_int():
    """A two-message conversation yields a strictly positive integer estimate."""
    conversation = [
        {"role": "system", "content": "hello"},
        {"role": "user", "content": "world"},
    ]
    estimate = bt.estimate_tokens(conversation, "gpt-4o-mini")
    assert isinstance(estimate, int)
    assert estimate > 0
|
||||
|
||||
|
||||
def test_estimate_tokens_longer_content_is_larger():
    """Repeating the content 500 times must increase the estimate."""
    model = "gpt-4o-mini"
    small = bt.estimate_tokens([{"role": "user", "content": "hi"}], model)
    large = bt.estimate_tokens([{"role": "user", "content": "hi " * 500}], model)
    assert large > small
|
||||
|
||||
|
||||
def test_estimate_tokens_fallback_without_tiktoken(monkeypatch):
    """Without tiktoken the estimate is 4 overhead tokens plus chars // 3."""
    import sys as _sys

    # A None entry in sys.modules makes `import tiktoken` raise ImportError.
    monkeypatch.setitem(_sys.modules, "tiktoken", None)
    payload = [{"role": "user", "content": "x" * 300}]
    expected = 4 + 300 // 3
    assert bt.estimate_tokens(payload, "gpt-4o") == expected
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# chunk_comments_by_tokens
|
||||
|
||||
def test_chunk_single_chunk_for_small_input(monkeypatch):
    """With a very large limit, both comments land in one chunk."""
    monkeypatch.setattr(bt, "MODEL_LIMITS", {"gpt-4o-mini": 10_000_000})
    second = {**COMMENT_ITEM, "comment_id": "99999"}
    result = bt.chunk_comments_by_tokens([COMMENT_ITEM, second], FORUM_ITEM, "gpt-4o-mini")
    assert len(result) == 1
    (only_chunk,) = result
    assert len(only_chunk) == 2
|
||||
|
||||
|
||||
def test_chunk_splits_when_over_limit(monkeypatch):
    """With a limit of 1 token no two comments fit together: one chunk each."""
    monkeypatch.setattr(bt, "MODEL_LIMITS", {"gpt-4o-mini": 1})
    extra_ids = ("99999", "88888")
    comments = [COMMENT_ITEM] + [{**COMMENT_ITEM, "comment_id": cid} for cid in extra_ids]
    result = bt.chunk_comments_by_tokens(comments, FORUM_ITEM, "gpt-4o-mini")
    assert len(result) == len(comments)
|
||||
|
||||
|
||||
def test_chunk_preserves_all_comments(monkeypatch):
    """Chunking must neither drop, duplicate, nor reorder comments.

    A count-only check would still pass if one comment were duplicated while
    another was dropped, so the flattened ids are compared against the input
    ids in order.
    """
    monkeypatch.setattr(bt, "MODEL_LIMITS", {"gpt-4o-mini": 200})
    comments = [{**COMMENT_ITEM, "comment_id": str(i)} for i in range(10)]
    chunks = bt.chunk_comments_by_tokens(comments, FORUM_ITEM, "gpt-4o-mini")
    flat = [c for chunk in chunks for c in chunk]
    assert len(flat) == 10
    assert [c["comment_id"] for c in flat] == [c["comment_id"] for c in comments]
    # The chunker should never emit an empty chunk.
    assert all(chunks)
|
||||
|
||||
|
||||
def test_model_limits_has_required_models():
    """Every model the pipeline is expected to support has a token limit."""
    required = ("gpt-4o", "gpt-4o-mini", "gpt-5.4", "gpt-5.4-mini", "gpt-o4-mini")
    for model in required:
        assert model in bt.MODEL_LIMITS, f"{model} missing from MODEL_LIMITS"
|
||||
|
||||
Reference in New Issue
Block a user