completed openai batch work
This commit is contained in:
File diff suppressed because one or more lines are too long
|
Before Width: | Height: | Size: 130 KiB |
@@ -1,9 +1,18 @@
|
||||
<mxfile host="app.diagrams.net">
|
||||
<diagram name="Page-1" id="0sW-Vs8X5usvYmJikUIv">
|
||||
<mxGraphModel dx="2179" dy="1118" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
|
||||
<mxGraphModel dx="1315" dy="798" grid="1" gridSize="10" guides="1" tooltips="1" connect="1" arrows="1" fold="1" page="0" pageScale="1" pageWidth="850" pageHeight="1100" math="0" shadow="0">
|
||||
<root>
|
||||
<mxCell id="0" />
|
||||
<mxCell id="1" parent="0" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-61" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
|
||||
<mxGeometry height="90" width="190" x="1000" y="330" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-60" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
|
||||
<mxGeometry height="90" width="190" x="1010" y="340" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-59" parent="1" style="rounded=0;whiteSpace=wrap;html=1;" value="" vertex="1">
|
||||
<mxGeometry height="90" width="190" x="1020" y="350" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-3" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-1" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-29">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="200" y="290" as="targetPoint" />
|
||||
@@ -18,18 +27,18 @@
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-5" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="tokenizer" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="400" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=center;verticalAlign=top;rounded=0;" value="gather forum data" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="20" y="240" as="geometry" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-6" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div align="left">- collect forum data</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="40" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div>tokenize forum,</div><div>generate report w/</div><div>recommendations</div>" vertex="1">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-7" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div>- tokenize forum</div><div>- generate report w/</div><div>recommendations</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="400" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-35">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-28" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-73">
|
||||
<mxGeometry relative="1" as="geometry">
|
||||
<mxPoint x="910" y="270" as="targetPoint" />
|
||||
<mxPoint x="953" y="240" as="targetPoint" />
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="batch" vertex="1">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-19" parent="1" style="shape=process;whiteSpace=wrap;html=1;backgroundOutline=1;" value="openai_batch" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="720" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-21" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="<div>--model</div><div>--limit</div>" vertex="1">
|
||||
@@ -38,11 +47,8 @@
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-23" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--forum" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="-90" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-25" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=right;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="--prompt" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="270" y="210" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div>split job into batches</div><div>submit first batch</div><div>status of current batch</div><div>download batch artifacts</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="720" y="240" as="geometry" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-26" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;" value="<div>- split job into batches</div><div>- submit first batch</div><div>- status of current batch</div><div>- download batch artifacts</div>" vertex="1">
|
||||
<mxGeometry height="70" width="140" x="720" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-29" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="210" y="240" as="geometry" />
|
||||
@@ -58,7 +64,7 @@
|
||||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>forum</div><div>.jsonl</div>" vertex="1">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-31" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>&lt;forumid&gt;</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="230" y="260" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-47" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-34" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-19">
|
||||
@@ -69,30 +75,42 @@
|
||||
</Array>
|
||||
</mxGeometry>
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>report</div><div>.json</div>" vertex="1">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-34" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div><br></div><div>&lt;forumid&gt;<br>-report</div><div>.json</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="560" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="job.json" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="890" y="240" as="geometry" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-35" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>status</div><div>.json</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="913.25" y="360" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-41" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="940" y="340" as="geometry" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>jobN-</div><div>output</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="1090" y="360" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-42" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="950" y="350" as="geometry" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>jobN-errors</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="1150" y="360" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-43" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>batchN-</div><div>output-</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="960" y="360" as="geometry" />
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-54" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>jobN-</div><div>input</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="1030" y="360" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-48" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>errors</div><div>.jsonl</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="980" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-51" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-41">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-64" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-63" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=0;exitY=0;exitDx=50;exitDy=43.5;exitPerimeter=0;entryX=0;entryY=0.5;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-5">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-53" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-19" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;exitX=1;exitY=0.5;exitDx=0;exitDy=0;entryX=0;entryY=0;entryDx=16.5;entryDy=0;entryPerimeter=0;" target="mENAtx_syaeSO5uR6kG6-48">
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-63" parent="1" style="shape=note;whiteSpace=wrap;html=1;backgroundOutline=1;darkOpacity=0.05;size=17;" value="<div>prompt</div><div>.txt</div>" vertex="1">
|
||||
<mxGeometry height="70" width="50" x="270" y="90" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-67" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="create" vertex="1">
|
||||
<mxGeometry height="20" width="120" x="850" y="170" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-71" parent="1" style="text;html=1;whiteSpace=wrap;strokeColor=none;fillColor=none;align=left;verticalAlign=top;rounded=0;fontFamily=Courier New;" value="<div>submit</div><div><br></div><div>status</div><div>download</div>" vertex="1">
|
||||
<mxGeometry height="60" width="120" x="1020" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-75" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;" target="mENAtx_syaeSO5uR6kG6-35">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-76" edge="1" parent="1" source="mENAtx_syaeSO5uR6kG6-73" style="edgeStyle=orthogonalEdgeStyle;rounded=0;orthogonalLoop=1;jettySize=auto;html=1;entryX=0.5;entryY=0;entryDx=0;entryDy=0;" target="mENAtx_syaeSO5uR6kG6-61">
|
||||
<mxGeometry relative="1" as="geometry" />
|
||||
</mxCell>
|
||||
<mxCell id="mENAtx_syaeSO5uR6kG6-73" parent="1" style="image;aspect=fixed;perimeter=ellipsePerimeter;html=1;align=center;shadow=0;dashed=0;spacingTop=3;image=img/lib/active_directory/folder.svg;" value="&lt;forumid&gt;-N" vertex="1">
|
||||
<mxGeometry height="50" width="36.5" x="920" y="240" as="geometry" />
|
||||
</mxCell>
|
||||
</root>
|
||||
</mxGraphModel>
|
||||
</diagram>
|
||||
|
||||
4
docs/pipeline-v1.2.3.svg
Normal file
4
docs/pipeline-v1.2.3.svg
Normal file
File diff suppressed because one or more lines are too long
|
After Width: | Height: | Size: 170 KiB |
@@ -201,44 +201,48 @@ batch processing should be a resumable job queue, not a one-shot script. the us
|
||||
- remaining-comment detection
|
||||
|
||||
** notes
|
||||
- analysis/gpt4o/tokenizer.py: new standalone script; imports analysis_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes report.json to input file's directory. MODEL_PRICING dict lives here (not in analysis_batch).
|
||||
- analysis/gpt4o/analysis_batch.py: fully rewritten with four subcommands: create, submit, status, download. No longer uses REQUESTS_DIR / RAW_DIR / RUNS_DIR.
|
||||
- Job directories: analysis/gpt4o/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
|
||||
- analysis/tokenizer.py: new standalone script; imports openai_batch for MODEL_LIMITS, estimate_tokens, build_messages. Reads input JSONL + prompt, computes per-model jobs/cost/time table, writes reports/<stem>-report.json. MODEL_PRICING dict lives here (not in openai_batch). Alternatively, pass a jobN-input.jsonl file to count the actual tokens in that job file instead of estimating.
|
||||
- analysis/openai_batch.py: fully rewritten with four subcommands: create, submit, status, download. Job dirs at analysis/jobs/<stem[:8]>-N/.
|
||||
- Job directories: analysis/jobs/<stem[:8]>-N/ (e.g. f452-1). Each run is self-contained: forum.jsonl, prompt.txt, report.json, jobN-input.jsonl, jobN-output-raw.jsonl, jobN-output.jsonl, jobN-errors.jsonl.
|
||||
- status.json: tracks all jobs with pending/submitted/in_progress/completed/failed states. Updated by submit, status, download.
|
||||
- _find_next_eligible_job: pure function for testability. Returns (next_pending_job, None) or (None, warning). Blocks submission if the previous job is still in_progress/submitted.
|
||||
- create: no API key required. Reads report.json, re-chunks comments, writes all jobN-input.jsonl files, writes status.json.
|
||||
- submit: uploads jobN-input.jsonl to Files API, creates batch, updates status.json to 'submitted'. Will not stack batches.
|
||||
- status: retrieves batch from OpenAI, updates status.json counts and status.
|
||||
- download: auto-runs status first, downloads output_file_id → jobN-output-raw.jsonl, error_file_id → jobN-errors.jsonl, normalizes → jobN-output.jsonl. Updates status.json.
|
||||
- tests/test_tokenizer.py: 15 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, report.json round-trip.
|
||||
- tests/tokenizer.py: 19 tests for compute_report schema, cost/time calculation, MODEL_PRICING coverage, print_table output, count_input_tokens, report.json round-trip.
|
||||
- Token limit buffer: _LIMIT_BUFFER=0.80 (20% headroom). Estimate uses OpenAI cookbook chat formula (role tokens + 3-token reply primer). Verify a job file with: python analysis/tokenizer.py analysis/jobs/<dir>/jobN-input.jsonl
|
||||
|
||||
*** usage
|
||||
#+begin_src sh
|
||||
#+begin_src powershell
|
||||
# 1. estimate tokens and cost
|
||||
python analysis/gpt4o/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
|
||||
# writes output/report.json
|
||||
python analysis/tokenizer.py output/f452.jsonl --prompt analysis/prompt-1.txt
|
||||
# writes reports/f452-report.json
|
||||
|
||||
# 2. create job directory (no api key needed)
|
||||
python analysis/gpt4o/analysis_batch.py create output/report.json --model gpt-4o-mini
|
||||
# creates analysis/gpt4o/jobs/f452-1/
|
||||
# 2. verify actual tokens in a job file (optional sanity check)
|
||||
python analysis/tokenizer.py analysis/jobs/f452-1/job1-input.jsonl
|
||||
|
||||
# 3. submit first job
|
||||
python analysis/gpt4o/analysis_batch.py submit
|
||||
# 3. create job directory (no api key needed)
|
||||
python analysis/openai_batch.py create reports/f452-report.json --model gpt-5.4-mini
|
||||
# creates analysis/jobs/f452-1/
|
||||
|
||||
# 4. check status (repeat until completed)
|
||||
python analysis/gpt4o/analysis_batch.py status
|
||||
# 4. submit first job
|
||||
python analysis/openai_batch.py submit
|
||||
|
||||
# 5. download and normalize
|
||||
python analysis/gpt4o/analysis_batch.py download
|
||||
# 5. check status (repeat until completed)
|
||||
python analysis/openai_batch.py status
|
||||
|
||||
# 6. submit next job (if multi-job run), then repeat 4-5
|
||||
python analysis/gpt4o/analysis_batch.py submit
|
||||
# 6. download and normalize
|
||||
python analysis/openai_batch.py download
|
||||
|
||||
# 7. submit next job (if multi-job run), then repeat 5-6
|
||||
python analysis/openai_batch.py submit
|
||||
#+end_src
|
||||
|
||||
** evidence
|
||||
- commit:
|
||||
- tests: passing (pytest tests/analysis_gpt4o_batch.py tests/test_tokenizer.py)
|
||||
- datetime: [2026-05-05 Tue]
|
||||
- tests: passing (pytest tests/openai_batch.py tests/openai_realtime.py tests/tokenizer.py)
|
||||
- datetime: [2026-05-06 Wed]
|
||||
|
||||
* === Backlog ===
|
||||
* [ ] X: analysis validation view
|
||||
|
||||
Reference in New Issue
Block a user