completed openai batch work

This commit is contained in:
2026-05-07 07:24:11 -04:00
parent 64a7a18721
commit f5d679808e
29 changed files with 36711 additions and 83 deletions

File diff suppressed because one or more lines are too long

Before

Width:  |  Height:  |  Size: 130 KiB

View File

@@ -1,9 +1,18 @@
<mxfile host="app.diagrams.net">
<diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv">
<mxGraphModel dx="2179" dy="1118" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<mxGraphModel dx="1315" dy="798" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
<root>
<mxCell id="0" />
<mxCell id="1" parent="0" />
<mxCell id="mENAtx_syaeSO5uR6kG6-61" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1000" y="330" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-60" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1010" y="340" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-59" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
<mxGeometry height="90" width="190" x="1020" y="350" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29">
<mxGeometry relative="1" as="geometry">
<mxPoint x="200" y="290" as="targetPoint" />
@@ -18,18 +27,18 @@
<mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1">
<mxGeometry height="60" width="120" x="400" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=center;verticalAlign=top;rounded=0;" value="gather forum data" vertex="1">
<mxGeometry height="60" width="120" x="20" y="240" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div align=&quot;left&quot;&gt;- collect forum data&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="40" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;tokenize forum,&lt;/div&gt;&lt;div&gt;generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;- tokenize forum&lt;/div&gt;&lt;div&gt;- generate report w/&lt;/div&gt;&lt;div&gt;recommendations&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="400" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-35">
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-73">
<mxGeometry relative="1" as="geometry">
<mxPoint x="910" y="270" as="targetPoint" />
<mxPoint x="953" y="240" as="targetPoint" />
</mxGeometry>
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="batch" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="openai_batch" vertex="1">
<mxGeometry height="60" width="120" x="720" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;--model&lt;/div&gt;&lt;div&gt;--limit&lt;/div&gt;" vertex="1">
@@ -38,11 +47,8 @@
<mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1">
<mxGeometry height="60" width="120" x="-90" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-25" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--prompt" vertex="1">
<mxGeometry height="60" width="120" x="270" y="210" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;split job into batches&lt;/div&gt;&lt;div&gt;submit first batch&lt;/div&gt;&lt;div&gt;status of current batch&lt;/div&gt;&lt;div&gt;download batch artifacts&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="720" y="240" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="&lt;div&gt;- split job into batches&lt;/div&gt;&lt;div&gt;- submit first batch&lt;/div&gt;&lt;div&gt;- status of current batch&lt;/div&gt;&lt;div&gt;- download batch artifacts&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="140" x="720" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="210" y="240" as="geometry" />
@@ -58,7 +64,7 @@
</Array>
</mxGeometry>
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;forum&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;&amp;lt;forumid&amp;gt;&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="230" y="260" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19">
@@ -69,30 +75,42 @@
</Array>
</mxGeometry>
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;&lt;br&gt;&lt;/div&gt;&lt;div&gt;&amp;lt;forumid&amp;gt;&lt;br&gt;-report&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="560" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="job.json" vertex="1">
<mxGeometry height="70" width="50" x="890" y="240" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;status&lt;/div&gt;&lt;div&gt;.json&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="913.25" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-41" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="940" y="340" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-&lt;/div&gt;&lt;div&gt;output&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="1090" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-42" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
<mxGeometry height="70" width="50" x="950" y="350" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="1150" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;batchN-&lt;/div&gt;&lt;div&gt;output-&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="960" y="360" as="geometry" />
<mxCell id="mENAtx_syaeSO5uR6kG6-54" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;jobN-&lt;/div&gt;&lt;div&gt;input&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="1030" y="360" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;errors&lt;/div&gt;&lt;div&gt;.jsonl&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="980" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-51" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-41">
<mxCell id="mENAtx_syaeSO5uR6kG6-64" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-63" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-5">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-53" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-48">
<mxCell id="mENAtx_syaeSO5uR6kG6-63" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="&lt;div&gt;prompt&lt;/div&gt;&lt;div&gt;.txt&lt;/div&gt;" vertex="1">
<mxGeometry height="70" width="50" x="270" y="90" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-67" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="create" vertex="1">
<mxGeometry height="20" width="120" x="850" y="170" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-71" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="&lt;div&gt;submit&lt;/div&gt;&lt;div&gt;&lt;br&gt;&lt;/div&gt;&lt;div&gt;status&lt;/div&gt;&lt;div&gt;download&lt;/div&gt;" vertex="1">
<mxGeometry height="60" width="120" x="1020" y="240" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-75" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" target="mENAtx_syaeSO5uR6kG6-35">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-76" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-61">
<mxGeometry relative="1" as="geometry" />
</mxCell>
<mxCell id="mENAtx_syaeSO5uR6kG6-73" parent="1" style="image;aspect=fixed;perimeter=ellipsePerimeter;html=1;align=center;shadow=0;dashed=0;spacingTop=3;image=img/lib/active_directory/folder.svg;" value="&amp;lt;forumid&amp;gt;-N" vertex="1">
<mxGeometry height="50" width="36.5" x="920" y="240" as="geometry" />
</mxCell>
</root>
</mxGraphModel>
</diagram>

4
docs/pipeline-v1.2.3.svg Normal file

File diff suppressed because one or more lines are too long

After

Width:  |  Height:  |  Size: 170 KiB

View File

@@ -201,44 +201,48 @@ batch processing should be a resumable job queue, not a one-shot script. the us
- remaining-comment detection
** notes
- analysis/gpt4o/tokenizer.py: new standalone script; imports analysis_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes report.json to input file's directory. MODEL_PRICING dict lives here (not in analysis_batch).
- analysis/gpt4o/analysis_batch.py: fully rewritten with four subcommands: create, submit, status, download. No longer uses REQUESTS_DIR / RAW_DIR / RUNS_DIR.
- Job directories: analysis/gpt4o/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
- analysis/tokenizer.py: new standalone script; imports openai_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes reports/<stem>-report.json. MODEL_PRICING dict lives here (not in openai_batch). Pass a jobN-input.jsonl to count actual tokens instead.
- analysis/openai_batch.py: fully rewritten with four subcommands: create, submit, status, download. Job dirs at analysis/jobs/<stem[:8]>-N/.
- Job directories: analysis/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
- status.json: tracks all jobs with pending/submitted/in_progress/completed/failed states. Updated by submit, status, download.
- _find_next_eligible_job: pure function for testability. Returns (next_pending_job, None) or (None, warning). Blocks submission if previous job is in_progress/submitted.
- create: no API key required. Reads report.json, re-chunks comments, writes all jobN-input.jsonl files, writes status.json.
- submit: uploads jobN-input.jsonl to Files API, creates batch, updates status.json to 'submitted'. Will not stack batches.
- status: retrieves batch from OpenAI, updates status.json counts and status.
- download: auto-runs status first, downloads output_file_id → jobN-output-raw.jsonl, error_file_id → jobN-errors.jsonl, normalizes → jobN-output.jsonl. Updates status.json.
- tests/test_tokenizer.py: 15 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, report.json round-trip.
- tests/tokenizer.py: 19 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, count_input_tokens, report.json round-trip.
- Token limit buffer: _LIMIT_BUFFER=0.80 (20% headroom). Estimate uses OpenAI cookbook chat formula (role tokens + 3-token reply primer). Verify a job file with: python analysis/tokenizer.py analysis/jobs/<dir>/jobN-input.jsonl
*** usage
#+begin_src sh
#+begin_src powershell
# 1. estimate tokens and cost
python analysis/gpt4o/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
# writes output/report.json
python analysis/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
# writes reports/f452-report.json
# 2. create job directory (no api key needed)
python analysis/gpt4o/analysis_batch.py create output/report.json --model gpt-4o-mini
# creates analysis/gpt4o/jobs/f452-1/
# 2. verify actual tokens in a job file (optional sanity check)
python analysis/tokenizer.py analysis/jobs/f452-1/job1-input.jsonl
# 3. submit first job
python analysis/gpt4o/analysis_batch.py submit
# 3. create job directory (no api key needed)
python analysis/openai_batch.py create reports/f452-report.json --model gpt-5.4-mini
# creates analysis/jobs/f452-1/
# 4. check status (repeat until completed)
python analysis/gpt4o/analysis_batch.py status
# 4. submit first job
python analysis/openai_batch.py submit
# 5. download and normalize
python analysis/gpt4o/analysis_batch.py download
# 5. check status (repeat until completed)
python analysis/openai_batch.py status
# 6. submit next job (if multi-job run), then repeat 4-5
python analysis/gpt4o/analysis_batch.py submit
# 6. download and normalize
python analysis/openai_batch.py download
# 7. submit next job (if multi-job run), then repeat 5-6
python analysis/openai_batch.py submit
#+end_src
** evidence
- commit:
- tests: passing (pytest tests/analysis_gpt4o_batch.py tests/test_tokenizer.py)
- datetime: [2026-05-05 Tue]
- tests: passing (pytest tests/openai_batch.py tests/openai_realtime.py tests/tokenizer.py)
- datetime: [2026-05-06 Wed]
* === Backlog ===
* [ ] X: analysis validation view