add gpt4o batch analysis
This commit is contained in:
420
analysis/gpt4o/analysis_batch.py
Normal file
420
analysis/gpt4o/analysis_batch.py
Normal file
@@ -0,0 +1,420 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
analysis/gpt4o/analysis_batch.py — OpenAI Batch API pipeline
|
||||||
|
|
||||||
|
Commands (run manually in order):
|
||||||
|
submit <input_jsonl> [--model gpt-4o] — build request file, upload, create batch
|
||||||
|
status <run_id> — check batch status, update manifest
|
||||||
|
download <run_id> — download + normalize output, update manifest
|
||||||
|
|
||||||
|
File layout (all under analysis/gpt4o/):
|
||||||
|
requests/<run_id>.jsonl — batch input sent to OpenAI
|
||||||
|
raw/<run_id>.jsonl — raw batch output from OpenAI
|
||||||
|
runs/<run_id>.json — run manifest
|
||||||
|
<run_id>_<model>.jsonl — normalized output (same schema as realtime)
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import hashlib
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sys
|
||||||
|
from datetime import datetime, timezone
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
try:
|
||||||
|
import openai
|
||||||
|
except ImportError:
|
||||||
|
sys.exit("openai package not installed. Run: pip install openai")
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Prompt
|
||||||
|
|
||||||
|
# Default system prompt file: prompt-1.txt in the parent of this script's directory.
_DEFAULT_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt"
# Read eagerly at import time; _load_prompt() replaces it when --prompt is given.
SYSTEM_PROMPT = _DEFAULT_PROMPT_FILE.read_text(encoding="utf-8").strip()
# First 7 hex digits of the SHA-256 of the prompt text; recorded as prompt_hash.
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
|
||||||
|
|
||||||
|
|
||||||
|
def _load_prompt(path: Path) -> None:
    """Load the system prompt from *path* and refresh the module-level globals.

    SYSTEM_PROMPT becomes the stripped file contents; PROMPT_VERSION becomes
    the first seven hex digits of the SHA-256 of that text.
    """
    global SYSTEM_PROMPT, PROMPT_VERSION
    prompt_text = path.read_text(encoding="utf-8").strip()
    digest = hashlib.sha256(prompt_text.encode("utf-8")).hexdigest()
    SYSTEM_PROMPT = prompt_text
    PROMPT_VERSION = digest[:7]
|
||||||
|
|
||||||
|
# User-message template; build_messages() fills the placeholders via str.format().
# The trailing backslashes suppress the newline after the opening quotes and
# before the closing quotes.
USER_TEMPLATE = """\
## Proposed Regulation
Title: {reg_title}
Description: {reg_desc}

---

## Public Comment
Comment ID: {comment_id}
Title: {comment_title}
Body:
{comment_text}

---
Classify this comment per the instructions. Return only JSON.\
"""

# Stripped comment bodies longer than this many characters are cut by build_messages().
MAX_COMMENT_CHARS = 6000
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Directories
|
||||||
|
|
||||||
|
# All pipeline artifacts live next to this script.
_SCRIPT_DIR = Path(__file__).parent
# Batch input files sent to OpenAI (requests/<run_id>.jsonl).
REQUESTS_DIR = _SCRIPT_DIR / "requests"
# Raw batch output downloaded from OpenAI (raw/<run_id>.jsonl).
RAW_DIR = _SCRIPT_DIR / "raw"
# Run manifests (runs/<run_id>.json).
RUNS_DIR = _SCRIPT_DIR / "runs"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Core functions (importable for tests)
|
||||||
|
|
||||||
|
|
||||||
|
def load_items(path: Path) -> tuple[dict | None, list[dict]]:
|
||||||
|
"""Read a scraped JSONL file. Returns (forum_item_or_None, [comment_items])."""
|
||||||
|
forum = None
|
||||||
|
comments = []
|
||||||
|
with open(path, encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
line = line.strip()
|
||||||
|
if not line:
|
||||||
|
continue
|
||||||
|
item = json.loads(line)
|
||||||
|
if "comment_id" in item:
|
||||||
|
comments.append(item)
|
||||||
|
elif "reg_title" in item:
|
||||||
|
forum = item
|
||||||
|
return forum, comments
|
||||||
|
|
||||||
|
|
||||||
|
def custom_id_from(comment_id: str) -> str:
    """Return the batch custom_id for a comment: ``comment_`` plus its id."""
    return "comment_{}".format(comment_id)
|
||||||
|
|
||||||
|
|
||||||
|
def parse_custom_id(custom_id: str) -> str:
    """Recover the comment_id by dropping one leading ``comment_`` prefix, if present."""
    prefix = "comment_"
    if custom_id.startswith(prefix):
        return custom_id[len(prefix):]
    return custom_id
|
||||||
|
|
||||||
|
|
||||||
|
def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
    """Assemble the system and user chat messages for a single comment.

    Regulation fields default to "[unknown]" when there is no forum item or
    the key is absent. The stripped comment text is replaced by a placeholder
    when empty and cut to MAX_COMMENT_CHARS (with a marker appended) when too
    long.

    Returns:
        ``(messages, truncated)``; ``truncated`` is True only when the body was cut.
    """
    regulation = forum if forum else {}

    text = (comment.get("text") or "").strip()
    was_cut = bool(text) and len(text) > MAX_COMMENT_CHARS
    if was_cut:
        text = text[:MAX_COMMENT_CHARS] + "... [truncated]"
    elif not text:
        text = "[No body text provided]"

    fields = {
        "reg_title": regulation.get("reg_title", "[unknown]"),
        "reg_desc": regulation.get("reg_desc", "[unknown]"),
        "comment_id": comment.get("comment_id", ""),
        "comment_title": comment.get("title", ""),
        "comment_text": text,
    }
    system_message = {"role": "system", "content": SYSTEM_PROMPT}
    user_message = {"role": "user", "content": USER_TEMPLATE.format(**fields)}
    return [system_message, user_message], was_cut
|
||||||
|
|
||||||
|
|
||||||
|
def build_batch_request_line(comment: dict, forum: dict | None, model: str) -> dict:
    """Return the request object for one comment, i.e. one line of the batch input JSONL.

    The request targets the chat-completions endpoint with JSON-object
    response format and temperature 0.0; the truncation flag from
    build_messages() is not recorded here.
    """
    chat_messages = build_messages(comment, forum)[0]
    request_body = {
        "model": model,
        "messages": chat_messages,
        "response_format": {"type": "json_object"},
        "temperature": 0.0,
    }
    request = {"custom_id": custom_id_from(comment["comment_id"])}
    request["method"] = "POST"
    request["url"] = "/v1/chat/completions"
    request["body"] = request_body
    return request
|
||||||
|
|
||||||
|
|
||||||
|
def normalize_output_line(
    raw_line: dict,
    comment_lookup: dict,
    run_id: str,
    analyzed_at: str,
    model: str,
    prompt_version: str,
) -> dict:
    """Convert one raw batch output line into a normalized analysis record.

    Args:
        raw_line: one parsed line of the batch output (or error) file.
        comment_lookup: ``{comment_id: comment item dict}`` built from the
            original input; unknown ids yield empty forum_id/input_title.
        run_id: run identifier copied into the record.
        analyzed_at: timestamp string copied into the record.
        model: model name copied into the record.
        prompt_version: taken from the run manifest so it reflects what was
            submitted.

    Returns:
        A record with the run/comment metadata, the five analysis fields
        (all None on failure), and ``error`` (None on success, else a message).
        This function never raises for a malformed response.
    """
    comment_id = parse_custom_id(raw_line.get("custom_id") or "")
    comment = comment_lookup.get(comment_id, {})

    base = {
        "run_id": run_id,
        "forum_id": comment.get("forum_id", ""),
        "comment_id": comment_id,
        "analyzed_at": analyzed_at,
        "model": model,
        "prompt_version": prompt_version,
        "input_title": comment.get("title", ""),
        # build_messages() truncates the *stripped* body, so measure the same
        # text here; otherwise whitespace padding could flip the flag.
        "truncated": len((comment.get("text") or "").strip()) > MAX_COMMENT_CHARS,
    }

    def failure(message: str) -> dict:
        # Same key order as a success record, with every analysis field nulled.
        return {**base, "stance": None, "stance_confidence": None,
                "stance_rationale": None, "tone": None, "tags": None,
                "error": message}

    # Outer-level batch error (e.g. batch_expired): there is no response at all.
    err = raw_line.get("error")
    if err:
        return failure(err.get("message", str(err)) if isinstance(err, dict) else str(err))

    response = raw_line.get("response") or {}
    status_code = response.get("status_code")
    if status_code != 200:
        message = f"status {status_code}"
        # Surface the API's own error message when the body carries one.
        body = response.get("body")
        api_error = body.get("error") if isinstance(body, dict) else None
        if isinstance(api_error, dict) and api_error.get("message"):
            message = f"{message}: {api_error['message']}"
        return failure(message)

    try:
        content = response["body"]["choices"][0]["message"]["content"]
        data = json.loads(content)
    except Exception as exc:
        # Deliberately broad: one malformed line must not abort the whole run.
        return failure(str(exc))

    if not isinstance(data, dict):
        return failure("response content is not a JSON object")

    keys = ("stance", "stance_confidence", "stance_rationale", "tone", "tags")
    parsed = {k: data.get(k) for k in keys}
    return {**base, **parsed, "error": None}
|
||||||
|
|
||||||
|
|
||||||
|
def make_manifest(
    run_id: str,
    input_filename: str,
    input_sha256: str,
    model: str,
    batch_id: str,
    records_submitted: int,
    request_filename: str,
) -> dict:
    """Create the initial manifest dict for a freshly submitted run.

    ``prompt_hash`` is the current module-level PROMPT_VERSION and
    ``created_at`` is the current UTC time in ISO format. Completion counts,
    output filenames and ``completed_at`` start as None.
    """
    created = datetime.now(timezone.utc).isoformat()
    manifest = dict(
        run_id=run_id,
        input_filename=input_filename,
        input_sha256=input_sha256,
        prompt_hash=PROMPT_VERSION,
        model=model,
        batch_id=batch_id,
        records_submitted=records_submitted,
    )
    # Unknown at submit time; the status/download commands fill these in.
    manifest.update(records_completed=None, records_failed=None)
    manifest["request_filename"] = request_filename
    manifest.update(
        raw_output_filename=None,
        normalized_output_filename=None,
        created_at=created,
        completed_at=None,
    )
    return manifest
|
||||||
|
|
||||||
|
|
||||||
|
def load_manifest(run_id: str) -> dict:
    """Read and parse the manifest stored at RUNS_DIR/<run_id>.json."""
    manifest_file = RUNS_DIR / f"{run_id}.json"
    with open(manifest_file, encoding="utf-8") as fh:
        return json.loads(fh.read())
|
||||||
|
|
||||||
|
|
||||||
|
def save_manifest(manifest: dict) -> None:
    """Write *manifest* as indented JSON to RUNS_DIR/<run_id>.json, creating RUNS_DIR if needed."""
    RUNS_DIR.mkdir(parents=True, exist_ok=True)
    target = RUNS_DIR / "{}.json".format(manifest["run_id"])
    # Serialize before opening so a failure here cannot truncate an existing file.
    serialized = json.dumps(manifest, indent=2, ensure_ascii=False)
    with open(target, "w", encoding="utf-8") as fh:
        fh.write(serialized)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Subcommand: submit
|
||||||
|
|
||||||
|
def cmd_submit(args, client) -> None:
    """Build the batch request file, upload it, create the batch, and save a manifest.

    Reads ``args.prompt``, ``args.input`` and ``args.model``. Progress messages
    go to stderr; the new run_id is the only thing printed to stdout so it can
    be captured by scripts. Exits via ``sys.exit`` when the prompt or input
    file is missing, or when the input holds no comment items.
    """
    import uuid  # local import: only the submit command needs it

    prompt_path = Path(args.prompt)
    if not prompt_path.exists():
        # Same friendly failure as for a missing input file, instead of a traceback.
        sys.exit(f"Prompt file not found: {prompt_path}")
    _load_prompt(prompt_path)
    print(f"Prompt: {args.prompt} (version {PROMPT_VERSION})", file=sys.stderr)

    input_path = Path(args.input)
    if not input_path.exists():
        sys.exit(f"File not found: {input_path}")

    print(f"Reading {input_path} ...", file=sys.stderr)
    forum, comments = load_items(input_path)
    if not comments:
        sys.exit("No comment items found in input file.")
    if forum is None:
        print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)

    run_id = str(uuid.uuid4())
    # Hash of the exact input bytes, recorded in the manifest.
    input_sha256 = hashlib.sha256(input_path.read_bytes()).hexdigest()

    # Build batch request file
    REQUESTS_DIR.mkdir(parents=True, exist_ok=True)
    request_path = REQUESTS_DIR / f"{run_id}.jsonl"
    with open(request_path, "w", encoding="utf-8") as f:
        for comment in comments:
            line = build_batch_request_line(comment, forum, args.model)
            f.write(json.dumps(line, ensure_ascii=False) + "\n")

    print(f"Wrote {len(comments)} requests → {request_path}", file=sys.stderr)

    # Upload to OpenAI
    print("Uploading request file ...", file=sys.stderr)
    with open(request_path, "rb") as f:
        uploaded = client.files.create(file=f, purpose="batch")
    print(f"Uploaded: {uploaded.id}", file=sys.stderr)

    # Create batch
    batch = client.batches.create(
        input_file_id=uploaded.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"run_id": run_id, "input_filename": str(input_path)},
    )
    print(f"Batch created: {batch.id} status={batch.status}", file=sys.stderr)

    # Save manifest
    manifest = make_manifest(
        run_id=run_id,
        input_filename=str(input_path),
        input_sha256=input_sha256,
        model=args.model,
        batch_id=batch.id,
        records_submitted=len(comments),
        request_filename=str(request_path),
    )
    save_manifest(manifest)

    print(f"\nrun_id: {run_id}", file=sys.stderr)
    # The script file is analysis_batch.py (underscore); the hyphenated name does not exist.
    print(f"Check status: python analysis/gpt4o/analysis_batch.py status {run_id}", file=sys.stderr)
    print(run_id)  # stdout for scripting
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Subcommand: status
|
||||||
|
|
||||||
|
def cmd_status(args, client) -> None:
    """Print the batch status and request counts, and store the counts in the manifest.

    Looks up the batch id in the manifest for ``args.run_id``, retrieves the
    batch, and prints a download hint once its status is "completed".
    """
    manifest = load_manifest(args.run_id)
    batch = client.batches.retrieve(manifest["batch_id"])

    print(f"status: {batch.status}")

    # request_counts is optional on the batch object; report and persist the
    # counts only when they are present.
    counts = batch.request_counts
    if counts is not None:
        print(f"completed: {counts.completed}/{counts.total}")
        print(f"failed: {counts.failed}")
        manifest["records_completed"] = counts.completed
        manifest["records_failed"] = counts.failed
        save_manifest(manifest)

    if batch.status == "completed":
        print("\nReady to download. Run:")
        # The script file is analysis_batch.py (underscore); the hyphenated name does not exist.
        print(f" python analysis/gpt4o/analysis_batch.py download {args.run_id}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Subcommand: download
|
||||||
|
|
||||||
|
def cmd_download(args, client) -> None:
    """Download a completed batch's results, normalize them, and update the manifest.

    Writes the raw output to raw/<run_id>.jsonl, the raw error file (when the
    batch has one) to raw/<run_id>.errors.jsonl, and normalized records for
    both to <run_id>_<model>.jsonl next to this script. Exits via ``sys.exit``
    when the batch is not completed or the original input file is missing.
    """
    manifest = load_manifest(args.run_id)
    batch = client.batches.retrieve(manifest["batch_id"])

    if batch.status != "completed":
        sys.exit(f"Batch not complete yet (status={batch.status}). Run 'status' to check.")

    run_id = manifest["run_id"]
    model = manifest["model"]
    model_slug = model.replace("/", "-")

    # Download raw output. output_file_id is absent when no request succeeded;
    # treat that as an empty output file rather than crashing.
    RAW_DIR.mkdir(parents=True, exist_ok=True)
    raw_path = RAW_DIR / f"{run_id}.jsonl"
    raw_text = client.files.content(batch.output_file_id).text if batch.output_file_id else ""
    raw_path.write_text(raw_text, encoding="utf-8")
    print(f"Raw output → {raw_path}", file=sys.stderr)

    # Failed requests are delivered in a separate error file; without it they
    # would silently disappear from the normalized output and the counts.
    error_text = ""
    error_file_id = getattr(batch, "error_file_id", None)
    if error_file_id:
        error_text = client.files.content(error_file_id).text
        error_path = RAW_DIR / f"{run_id}.errors.jsonl"
        error_path.write_text(error_text, encoding="utf-8")
        print(f"Raw errors → {error_path}", file=sys.stderr)

    # Build comment lookup from original input for reconciliation
    input_path = Path(manifest["input_filename"])
    if not input_path.exists():
        sys.exit(f"Input file not found: {input_path} (run from the directory used for 'submit').")
    # Titles and truncation flags come from this file, so warn if it changed
    # since the run was submitted.
    if hashlib.sha256(input_path.read_bytes()).hexdigest() != manifest.get("input_sha256"):
        print("Warning: input file differs from the one submitted (sha256 mismatch).", file=sys.stderr)
    _, comments = load_items(input_path)
    comment_lookup = {c["comment_id"]: c for c in comments}

    # Normalize
    completed_at = datetime.now(timezone.utc).isoformat()
    if batch.completed_at:
        completed_at = datetime.fromtimestamp(batch.completed_at, tz=timezone.utc).isoformat()

    normalized_path = _SCRIPT_DIR / f"{run_id}_{model_slug}.jsonl"
    n_ok = n_err = 0
    with open(normalized_path, "w", encoding="utf-8") as out:
        for text in (raw_text, error_text):
            for line in text.splitlines():
                if not line.strip():
                    continue
                raw_line = json.loads(line)
                record = normalize_output_line(
                    raw_line, comment_lookup, run_id, completed_at, model, manifest["prompt_hash"]
                )
                out.write(json.dumps(record, ensure_ascii=False) + "\n")
                if record["error"]:
                    n_err += 1
                else:
                    n_ok += 1

    print(f"Normalized → {normalized_path} ({n_ok} ok, {n_err} errors)", file=sys.stderr)

    manifest["records_completed"] = n_ok
    manifest["records_failed"] = n_err
    manifest["raw_output_filename"] = str(raw_path)
    manifest["normalized_output_filename"] = str(normalized_path)
    manifest["completed_at"] = completed_at
    save_manifest(manifest)
    print(f"Manifest updated → {RUNS_DIR / run_id}.json", file=sys.stderr)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse the command line and dispatch to the submit/status/download handler.

    Arguments are parsed before the API key is checked so that ``--help`` and
    usage errors work without credentials. Exits via ``sys.exit`` when
    OPENAI_API_KEY is not set in the environment or a .env file.
    """
    load_dotenv()

    parser = argparse.ArgumentParser(
        description="Public comment batch analysis pipeline.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    sub = parser.add_subparsers(dest="command", required=True)

    p_submit = sub.add_parser("submit", help="Build and submit a batch job")
    p_submit.add_argument("input", help="Path to scraped JSONL file")
    p_submit.add_argument("--model", default="gpt-4o", help="OpenAI model (default: gpt-4o)")
    p_submit.add_argument(
        "--prompt",
        default=str(_DEFAULT_PROMPT_FILE),
        help="Path to system prompt file (default: analysis/prompt-1.txt)",
    )

    p_status = sub.add_parser("status", help="Check batch status")
    p_status.add_argument("run_id", help="run_id from submit output")

    p_download = sub.add_parser("download", help="Download and normalize completed batch")
    p_download.add_argument("run_id", help="run_id from submit output")

    args = parser.parse_args()

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")

    client = openai.OpenAI(api_key=api_key)

    # argparse restricts args.command to these three subcommand names.
    handlers = {
        "submit": cmd_submit,
        "status": cmd_status,
        "download": cmd_download,
    }
    handlers[args.command](args, client)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script, not when the module is imported.
if __name__ == "__main__":
    main()
|
||||||
9083
analysis/gpt4o/requests/5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl
Normal file
9083
analysis/gpt4o/requests/5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl
Normal file
File diff suppressed because one or more lines are too long
@@ -0,0 +1,16 @@
|
|||||||
|
{
|
||||||
|
"run_id": "5b8714a7-0666-40a2-9d69-2d9ce9074406",
|
||||||
|
"input_filename": "output\\f452.jsonl",
|
||||||
|
"input_sha256": "59dcc8b13cc2a386977a8b934c498c7e639b7e684a94ca1bfd10a14878670018",
|
||||||
|
"prompt_hash": "cb41250",
|
||||||
|
"model": "gpt-4o",
|
||||||
|
"batch_id": "batch_69fa579c7cd081909c049715838df6c6",
|
||||||
|
"records_submitted": 9083,
|
||||||
|
"records_completed": 0,
|
||||||
|
"records_failed": 0,
|
||||||
|
"request_filename": "C:\\Users\\moses\\projects\\vath\\analysis\\gpt4o\\requests\\5b8714a7-0666-40a2-9d69-2d9ce9074406.jsonl",
|
||||||
|
"raw_output_filename": null,
|
||||||
|
"normalized_output_filename": null,
|
||||||
|
"created_at": "2026-05-05T20:48:28.268022+00:00",
|
||||||
|
"completed_at": null
|
||||||
|
}
|
||||||
23
analysis/prompt-1.txt
Normal file
23
analysis/prompt-1.txt
Normal file
@@ -0,0 +1,23 @@
|
|||||||
|
You are an expert policy analyst classifying public comments submitted to the Virginia Town Hall
|
||||||
|
regulatory comment system. You will be given the text of a proposed regulation and a single
|
||||||
|
public comment. Return ONLY a JSON object — no other text.
|
||||||
|
|
||||||
|
Definitions:
|
||||||
|
- stance: the commenter's position on whether the regulation should be adopted.
|
||||||
|
"support" = wants it approved (as-is or with changes);
|
||||||
|
"oppose" = wants it rejected or substantially weakened;
|
||||||
|
"neutral" = takes no position, asks a question, or provides factual input only;
|
||||||
|
"unknown" = too vague, off-topic, or uninterpretable to classify.
|
||||||
|
- tone: the emotional register of the writing, independent of stance.
|
||||||
|
"positive" = affirming, hopeful, appreciative;
|
||||||
|
"negative" = angry, fearful, alarmed, or contemptuous;
|
||||||
|
"neutral" = matter-of-fact, procedural, or informational;
|
||||||
|
"mixed" = contains both positive and negative emotional content;
|
||||||
|
"unclear" = tone cannot be determined (e.g., a one-word comment).
|
||||||
|
- stance_confidence: float 0.0-1.0, your confidence in the stance label.
|
||||||
|
- stance_rationale: 1-3 sentences explaining the key evidence; quote specific phrases where possible.
|
||||||
|
- tags: up to 5 short topic labels relevant to the comment's specific concerns (e.g.
|
||||||
|
"parental rights", "student safety", "privacy", "religious freedom", "LGBTQ+ inclusion",
|
||||||
|
"bullying prevention", "school sports", "bathroom access"). Empty array if none apply.
|
||||||
|
|
||||||
|
Return exactly these keys: stance, stance_confidence, stance_rationale, tone, tags.
|
||||||
@@ -1,3 +1,7 @@
|
|||||||
|
#+title: VATH Task Log
|
||||||
|
#+date: [2026-05-05 Tue]
|
||||||
|
#+startup: Overview
|
||||||
|
|
||||||
* [X] t1.1: scrape one forum (1)
|
* [X] t1.1: scrape one forum (1)
|
||||||
Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step.
|
Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step.
|
||||||
ViewComments (townhall.virginia.gov/L/ViewComments.cfm?CommentID=#) appears to be raw list of all comments on forum - could be useful later for whole-scrape
|
ViewComments (townhall.virginia.gov/L/ViewComments.cfm?CommentID=#) appears to be raw list of all comments on forum - could be useful later for whole-scrape
|
||||||
@@ -68,21 +72,38 @@ Should be run manually, separate from scraper. You may use scrapy, but are not r
|
|||||||
|
|
||||||
** evidence
|
** evidence
|
||||||
- commit: d834d18
|
- commit: d834d18
|
||||||
- tests: 20 passing (pytest tests/test_gpt4o_analysis.py), 28 total across suite
|
- tests: 20 passing (pytest tests/analysis_gpt4o_realtime.py), 28 total across suite
|
||||||
- `python ./analysis/gpt4o/analysis.py --limit 5 ./output/f452.jsonl`
|
- `python ./analysis/gpt4o/analysis_realtime.py --limit 5 ./output/f452.jsonl`
|
||||||
- see: ./analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T18-48-32+00-00.jsonl
|
- see: ./analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T18-48-32+00-00.jsonl
|
||||||
- date: [2026-05-05 Tue 15:00]
|
- date: [2026-05-05 Tue 15:00]
|
||||||
|
|
||||||
* [ ] t1.2.1: 4o with batch processing
|
* [ ] t1.2.1: batch processing
|
||||||
|
Create analysis-batch.py to capture same elements as t1.2 above.
|
||||||
|
May need to add multiple commands to upload, check batch status, download, etc.
|
||||||
|
Commands should all be run manually.
|
||||||
|
Reference: ./docs/openai-batch.md. openai batch output order is not guaranteed, so custom_id is mandatory for reconciliation
|
||||||
** acceptance criteria
|
** acceptance criteria
|
||||||
1. input scraped jsonl doc by filename/path, and process the whole thing via batch processing
|
1. input scraped jsonl doc by filename/path, and process the whole thing via batch processing
|
||||||
|
- ignore non-comment items in jsonl
|
||||||
|
- do not modify raw scraper output
|
||||||
|
- specify model and prompt
|
||||||
|
2. output a run manifest in ./analysis/<model>/runs/<run_id>.json
|
||||||
|
- include: run_id, input_filename, input_sha256, prompt_hash, model, batch_id, records_submitted, records_completed, records_failed, request_filename, raw_output_filename, normalized_output_filename, created_at, completed_at
|
||||||
|
3. add tests without live api calls
|
||||||
** notes
|
** notes
|
||||||
|
- analysis/gpt4o/analysis_batch.py with three subcommands:
|
||||||
|
- `submit`: reads scraped JSONL, builds batch request file (requests/<run_id>.jsonl), uploads to Files API, creates batch, saves manifest to runs/<run_id>.json. Prints run_id to stdout for scripting.
|
||||||
|
- `status`: retrieves batch from OpenAI, prints status + counts, updates manifest.
|
||||||
|
- `download`: downloads raw output to raw/<run_id>.jsonl, normalizes to <run_id>_<model>.jsonl using comment_lookup keyed by comment_id for reconciliation (batch output order not guaranteed). Updates manifest with filenames, counts, completed_at.
|
||||||
|
- custom_id format: comment_{comment_id} — unique within a forum, stable across runs.
|
||||||
|
- PROMPT_VERSION derived from analysis/prompt-1.txt (same file as realtime); both scripts produce matching prompt_hash in all records.
|
||||||
|
- analysis/prompt-1.txt: system prompt as plaintext, read at import time by both scripts. Edit here to change prompt for both pipelines.
|
||||||
|
- Tests use importlib.util to load hyphenated filenames; monkeypatch for RUNS_DIR in save/load test.
|
||||||
|
|
||||||
** evidence
|
** evidence
|
||||||
- commit:
|
- commit:
|
||||||
- tests:
|
- tests: 18 passing (pytest tests/analysis_gpt4o_batch.py), 46 total across suite
|
||||||
- date:
|
- datetime: [2026-05-05 Tue 17:00]
|
||||||
|
|
||||||
* [ ] X: complete proposal information
|
* [ ] X: complete proposal information
|
||||||
Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted.
|
Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted.
|
||||||
|
|||||||
5
pytest.ini
Normal file
5
pytest.ini
Normal file
@@ -0,0 +1,5 @@
|
|||||||
|
[pytest]
|
||||||
|
testpaths = tests
|
||||||
|
python_files = *.py
|
||||||
|
python_classes = Test*
|
||||||
|
python_functions = test_*
|
||||||
252
tests/analysis_gpt4o_batch.py
Normal file
252
tests/analysis_gpt4o_batch.py
Normal file
@@ -0,0 +1,252 @@
|
|||||||
|
"""Unit tests for analysis/gpt4o/analysis_batch.py — no real API calls."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
from unittest.mock import MagicMock
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
|
||||||
|
import analysis_batch as bt
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
|
||||||
|
# Forum item carrying the regulation title and description used in the user message.
FORUM_ITEM = {
    "forum_id": "452",
    "reg_title": "Model Policies for Transgender Students",
    "reg_desc": "Guidance developed in response to HB 145.",
}

# A short comment item, well under MAX_COMMENT_CHARS, so it is never truncated.
COMMENT_ITEM = {
    "forum_id": "452",
    "comment_id": "87914",
    "author": "Alice Example",
    "date": "2021-01-04T09:15:00",
    "title": "I support this policy",
    "text": "This is a great policy that protects students.",
}

# Batch output line for a successful request: status 200 with valid JSON content.
RAW_SUCCESS_LINE = {
    "id": "batch_req_001",
    "custom_id": "comment_87914",
    "response": {
        "status_code": 200,
        "request_id": "req_abc",
        "body": {
            "id": "chatcmpl-xyz",
            "choices": [{
                "index": 0,
                "message": {
                    "role": "assistant",
                    "content": json.dumps({
                        "stance": "support",
                        "stance_confidence": 0.95,
                        "stance_rationale": "Commenter explicitly endorses the policy.",
                        "tone": "positive",
                        "tags": ["student safety"],
                    }),
                },
                "finish_reason": "stop",
            }],
        },
    },
    "error": None,
}

# Batch output line with an outer-level error and no response at all.
RAW_ERROR_LINE = {
    "id": "batch_req_002",
    "custom_id": "comment_87914",
    "response": None,
    "error": {"code": "batch_expired", "message": "This request could not be executed."},
}

# Batch output line whose response carries a non-200 status code.
RAW_HTTP_ERROR_LINE = {
    "id": "batch_req_003",
    "custom_id": "comment_87914",
    "response": {"status_code": 400, "body": {}},
    "error": None,
}

# Shared arguments for normalize_output_line() calls.
COMMENT_LOOKUP = {"87914": COMMENT_ITEM}
ANALYZED_AT = "2026-05-05T18:00:00+00:00"
RUN_ID = "test-run-id-123"
MODEL = "gpt-4o"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Prompt versioning (batch reads the same prompt file)
|
||||||
|
|
||||||
|
def test_prompt_version_is_7_hex_chars():
    """PROMPT_VERSION is exactly seven lowercase hexadecimal characters."""
    version = bt.PROMPT_VERSION
    assert len(version) == 7
    assert set(version) <= set("0123456789abcdef")
|
||||||
|
|
||||||
|
|
||||||
|
def test_prompt_version_matches_realtime():
    """The batch and realtime scripts must report the same PROMPT_VERSION."""
    import analysis_realtime as realtime

    batch_version = bt.PROMPT_VERSION
    assert batch_version == realtime.PROMPT_VERSION
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# custom_id helpers
|
||||||
|
|
||||||
|
def test_custom_id_from():
    """custom_id_from prefixes the comment id with ``comment_``."""
    produced = bt.custom_id_from("87914")
    assert produced == "comment_87914"
|
||||||
|
|
||||||
|
|
||||||
|
def test_parse_custom_id():
    """parse_custom_id strips the ``comment_`` prefix."""
    recovered = bt.parse_custom_id("comment_87914")
    assert recovered == "87914"
|
||||||
|
|
||||||
|
|
||||||
|
def test_custom_id_round_trip():
    """Encoding then decoding a comment id returns the original id."""
    original = "12345"
    encoded = bt.custom_id_from(original)
    assert bt.parse_custom_id(encoded) == original
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# build_batch_request_line
|
||||||
|
|
||||||
|
def test_batch_request_line_structure():
    """A request line carries the custom_id, endpoint, model settings and both messages."""
    request = bt.build_batch_request_line(COMMENT_ITEM, FORUM_ITEM, "gpt-4o")
    assert request["custom_id"] == "comment_87914"
    assert request["method"] == "POST"
    assert request["url"] == "/v1/chat/completions"

    body = request["body"]
    assert body["model"] == "gpt-4o"
    assert body["temperature"] == 0.0
    assert body["response_format"] == {"type": "json_object"}

    system_message, user_message = body["messages"][0], body["messages"][1]
    assert system_message["role"] == "system"
    assert user_message["role"] == "user"
|
||||||
|
|
||||||
|
|
||||||
|
def test_batch_request_line_includes_reg_context():
    """The user message contains the regulation title and description."""
    request = bt.build_batch_request_line(COMMENT_ITEM, FORUM_ITEM, "gpt-4o")
    user_text = request["body"]["messages"][1]["content"]
    for expected_fragment in ("Model Policies for Transgender Students", "HB 145"):
        assert expected_fragment in user_text
|
||||||
|
|
||||||
|
|
||||||
|
def test_batch_request_line_truncation():
    """An over-long body is cut to MAX_COMMENT_CHARS and marked as truncated."""
    oversized = dict(COMMENT_ITEM, text="x" * 7000)
    request = bt.build_batch_request_line(oversized, FORUM_ITEM, "gpt-4o")
    user_text = request["body"]["messages"][1]["content"]
    assert "... [truncated]" in user_text
    # No other part of the rendered message contains "x", so the count equals the kept body length.
    assert user_text.count("x") == bt.MAX_COMMENT_CHARS
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# normalize_output_line — success
|
||||||
|
|
||||||
|
def test_normalize_success_all_keys():
    """A successful record has exactly the expected set of keys."""
    expected_keys = {
        "run_id", "forum_id", "comment_id", "analyzed_at", "model", "prompt_version",
        "stance", "stance_confidence", "stance_rationale", "tone", "tags",
        "input_title", "truncated", "error",
    }
    result = bt.normalize_output_line(
        RAW_SUCCESS_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    assert set(result.keys()) == expected_keys
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_success_values():
    """A successful record carries the parsed analysis and the run metadata."""
    result = bt.normalize_output_line(
        RAW_SUCCESS_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    expected_values = {
        "stance": "support",
        "tone": "positive",
        "comment_id": "87914",
        "run_id": RUN_ID,
        "analyzed_at": ANALYZED_AT,
    }
    for key, expected in expected_values.items():
        assert result[key] == expected
    assert result["error"] is None
    assert result["truncated"] is False
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_success_input_title():
    """input_title is copied from the matching comment item."""
    result = bt.normalize_output_line(
        RAW_SUCCESS_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    assert result["input_title"] == COMMENT_ITEM["title"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# normalize_output_line — errors
|
||||||
|
|
||||||
|
def test_normalize_batch_expired_error():
    """An outer-level batch error yields its message and null analysis fields."""
    result = bt.normalize_output_line(
        RAW_ERROR_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    error_text = result["error"]
    assert error_text is not None
    assert "could not be executed" in error_text
    assert result["stance"] is None
    assert result["tone"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_http_error():
    """A non-200 response is reported as an error with no stance."""
    result = bt.normalize_output_line(
        RAW_HTTP_ERROR_LINE, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    assert result["error"] is not None
    assert result["stance"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_malformed_json_in_response():
    """Unparseable message content is reported as an error instead of raising."""
    broken_message = {"content": "not valid json{{{"}
    malformed = {
        "id": "batch_req_004",
        "custom_id": "comment_87914",
        "response": {
            "status_code": 200,
            "body": {"choices": [{"message": broken_message}]},
        },
        "error": None,
    }
    result = bt.normalize_output_line(
        malformed, COMMENT_LOOKUP, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    assert result["error"] is not None
    assert result["stance"] is None
|
||||||
|
|
||||||
|
|
||||||
|
def test_normalize_unknown_comment_id():
    """With an empty lookup the record keeps the comment_id and leaves forum_id/title empty."""
    empty_lookup = {}
    result = bt.normalize_output_line(
        RAW_SUCCESS_LINE, empty_lookup, RUN_ID, ANALYZED_AT, MODEL, bt.PROMPT_VERSION
    )
    assert result["comment_id"] == "87914"
    assert result["forum_id"] == ""
    assert result["input_title"] == ""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Manifest
|
||||||
|
|
||||||
|
def test_make_manifest_all_keys():
    """make_manifest returns exactly the fourteen manifest keys."""
    expected_keys = {
        "run_id", "input_filename", "input_sha256", "prompt_hash", "model",
        "batch_id", "records_submitted", "records_completed", "records_failed",
        "request_filename", "raw_output_filename", "normalized_output_filename",
        "created_at", "completed_at",
    }
    manifest = bt.make_manifest(
        run_id=RUN_ID,
        input_filename="output/forum452.jsonl",
        input_sha256="abc123",
        model="gpt-4o",
        batch_id="batch_xyz",
        records_submitted=100,
        request_filename="analysis/gpt4o/requests/test-run-id-123.jsonl",
    )
    assert set(manifest.keys()) == expected_keys
|
||||||
|
|
||||||
|
|
||||||
|
def test_make_manifest_initial_nulls():
    """A fresh manifest has null completion fields and the current prompt hash."""
    manifest = bt.make_manifest(
        run_id=RUN_ID,
        input_filename="f",
        input_sha256="s",
        model="gpt-4o",
        batch_id="b",
        records_submitted=10,
        request_filename="r",
    )
    pending_fields = (
        "records_completed",
        "records_failed",
        "raw_output_filename",
        "normalized_output_filename",
        "completed_at",
    )
    for field_name in pending_fields:
        assert manifest[field_name] is None
    assert manifest["prompt_hash"] == bt.PROMPT_VERSION
|
||||||
|
|
||||||
|
|
||||||
|
def test_manifest_save_load_roundtrip(tmp_path, monkeypatch):
    """A manifest saved to a temporary RUNS_DIR loads back unchanged."""
    # Redirect RUNS_DIR so the test never touches the real runs/ directory.
    monkeypatch.setattr(bt, "RUNS_DIR", tmp_path)
    original = bt.make_manifest(
        run_id=RUN_ID,
        input_filename="f",
        input_sha256="s",
        model="gpt-4o",
        batch_id="b",
        records_submitted=42,
        request_filename="r",
    )
    bt.save_manifest(original)
    assert bt.load_manifest(RUN_ID) == original
|
||||||
Reference in New Issue
Block a user