remove hyphen for underscore in nomenclature, remove dependency
This commit is contained in:
228
analysis/gpt4o/analysis_realtime.py
Normal file
228
analysis/gpt4o/analysis_realtime.py
Normal file
@@ -0,0 +1,228 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
analysis/gpt4o/analysis_realtime.py — Synchronous GPT-4o pipeline for VA Townhall comments.
|
||||
|
||||
Usage:
|
||||
python analysis/gpt4o/analysis_realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
|
||||
|
||||
Output:
|
||||
analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import hashlib
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
|
||||
from dotenv import load_dotenv
|
||||
|
||||
try:
|
||||
import openai
|
||||
except ImportError:
|
||||
sys.exit("openai package not installed. Run: pip install openai")
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Prompt — loaded from analysis/prompt-1.txt at import time
|
||||
|
||||
# The system prompt lives in prompt-1.txt, one directory above the directory
# containing this file. It is read while the module is being imported, so a
# missing or unreadable file makes the import itself fail.
_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt"
# Surrounding whitespace is stripped before the text is used or hashed.
SYSTEM_PROMPT = _PROMPT_FILE.read_text(encoding="utf-8").strip()
# Short fingerprint of the prompt: the first 7 hex digits of the SHA-256 of the
# stripped text. analyze_comment() stores it on every record as "prompt_version".
PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
|
||||
|
||||
USER_TEMPLATE = """\
|
||||
## Proposed Regulation
|
||||
Title: {reg_title}
|
||||
Description: {reg_desc}
|
||||
|
||||
---
|
||||
|
||||
## Public Comment
|
||||
Comment ID: {comment_id}
|
||||
Title: {comment_title}
|
||||
Body:
|
||||
{comment_text}
|
||||
|
||||
---
|
||||
Classify this comment per the instructions. Return only JSON.\
|
||||
"""
|
||||
|
||||
# Comment bodies longer than this many characters are cut to this length by
# build_messages(), which then appends a "... [truncated]" marker.
MAX_COMMENT_CHARS = 6000
# Seconds slept before each retry in _call_api() after a rate-limit error.
# The first attempt is made without waiting, so there are len()+1 attempts.
_RETRY_DELAYS = [1.0, 2.0]
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core functions
|
||||
|
||||
|
||||
def load_items(path: Path) -> tuple[dict | None, list[dict]]:
|
||||
"""Read a scraped JSONL file. Returns (forum_item_or_None, [comment_items])."""
|
||||
forum = None
|
||||
comments = []
|
||||
with open(path, encoding="utf-8") as f:
|
||||
for line in f:
|
||||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
item = json.loads(line)
|
||||
if "comment_id" in item:
|
||||
comments.append(item)
|
||||
elif "reg_title" in item:
|
||||
forum = item
|
||||
return forum, comments
|
||||
|
||||
|
||||
def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
    """Assemble the chat messages for a single comment.

    The regulation title/description come from ``forum`` and fall back to
    ``"[unknown]"`` when ``forum`` is ``None`` or lacks the key. The comment's
    ``"text"`` is stripped; an empty body is replaced by a placeholder and a
    body longer than ``MAX_COMMENT_CHARS`` is cut and marked as truncated.

    Args:
        comment: Comment record; reads ``"text"``, ``"comment_id"``, ``"title"``.
        forum: Forum record, or ``None`` when no regulation context is known.

    Returns:
        ``(messages, truncated)`` where ``messages`` is the system message
        followed by the user message, and ``truncated`` tells whether the body
        was shortened.
    """
    regulation = forum or {}

    text = (comment.get("text") or "").strip()
    # Only a non-empty body can be over-long; an empty one gets the placeholder.
    was_cut = bool(text) and len(text) > MAX_COMMENT_CHARS
    if was_cut:
        text = text[:MAX_COMMENT_CHARS] + "... [truncated]"
    elif not text:
        text = "[No body text provided]"

    fields = {
        "reg_title": regulation.get("reg_title", "[unknown]"),
        "reg_desc": regulation.get("reg_desc", "[unknown]"),
        "comment_id": comment.get("comment_id", ""),
        "comment_title": comment.get("title", ""),
        "comment_text": text,
    }
    system_msg = {"role": "system", "content": SYSTEM_PROMPT}
    user_msg = {"role": "user", "content": USER_TEMPLATE.format(**fields)}
    return [system_msg, user_msg], was_cut
|
||||
|
||||
|
||||
def parse_api_response(content: str) -> dict:
    """Decode the model's JSON reply and keep only the classification fields.

    Args:
        content: Raw message content returned by the chat completion.

    Returns:
        A dict with exactly the keys ``stance``, ``stance_confidence``,
        ``stance_rationale``, ``tone`` and ``tags``; a key missing from the
        reply maps to ``None``. Any other keys in the reply are dropped.

    Raises:
        ValueError: If ``content`` is ``None`` or the decoded JSON is not an
            object.
        json.JSONDecodeError: If ``content`` is not valid JSON.
    """
    if content is None:
        # json.loads(None) would raise an opaque TypeError; name the real cause.
        raise ValueError("model returned no content")
    data = json.loads(content)
    if not isinstance(data, dict):
        # Without this check a list/scalar fails later with AttributeError on .get.
        raise ValueError(
            f"expected a JSON object from the model, got {type(data).__name__}"
        )
    keys = ("stance", "stance_confidence", "stance_rationale", "tone", "tags")
    return {k: data.get(k) for k in keys}
|
||||
|
||||
|
||||
def _call_api(client, messages: list, model: str) -> str:
    """Request one chat completion, retrying when the API reports rate limiting.

    One attempt is made immediately, then one more after each pause listed in
    ``_RETRY_DELAYS``. Only ``openai.RateLimitError`` triggers a retry; any
    other exception propagates from the attempt that raised it.

    Args:
        client: Object exposing ``chat.completions.create``.
        messages: Chat messages to send.
        model: Model name passed through to the API.

    Returns:
        The content of the first choice's message.

    Raises:
        openai.RateLimitError: The error from the final attempt, once every
            attempt has been rate limited.
    """
    request = {
        "model": model,
        "messages": messages,
        "response_format": {"type": "json_object"},
        "temperature": 0.0,
    }
    failure = None
    for pause in (0.0, *_RETRY_DELAYS):
        if pause:
            time.sleep(pause)
        try:
            reply = client.chat.completions.create(**request)
            return reply.choices[0].message.content
        except openai.RateLimitError as err:
            failure = err
    raise failure  # type: ignore[misc]
|
||||
|
||||
|
||||
def analyze_comment(client, comment: dict, forum: dict | None, run_id: str, model: str) -> dict:
    """Classify one comment and return its output record.

    Failures while building the messages, calling the API or parsing the reply
    are not raised: they yield a record whose classification fields are
    ``None``, whose ``"truncated"`` is ``False`` and whose ``"error"`` holds the
    exception text.

    Args:
        client: API client handed to ``_call_api``.
        comment: Comment record; reads ``"forum_id"``, ``"comment_id"``, ``"title"``.
        forum: Forum record or ``None``; forwarded to ``build_messages``.
        run_id: Identifier copied onto the record.
        model: Model name, used for the request and copied onto the record.

    Returns:
        The record: run metadata, the five classification fields,
        ``"truncated"`` and ``"error"`` (``None`` on success).
    """
    record = {
        "run_id": run_id,
        "forum_id": comment.get("forum_id", ""),
        "comment_id": comment.get("comment_id", ""),
        # Timestamp is taken before the request is sent.
        "analyzed_at": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "prompt_version": PROMPT_VERSION,
        "input_title": comment.get("title", ""),
    }
    try:
        messages, truncated = build_messages(comment, forum)
        raw_reply = _call_api(client, messages, model)
        record.update(parse_api_response(raw_reply))
        record["truncated"] = truncated
        record["error"] = None
    except Exception as exc:
        # Keep the same key order as a successful record.
        record.update(
            dict.fromkeys(
                ("stance", "stance_confidence", "stance_rationale", "tone", "tags")
            )
        )
        record["truncated"] = False
        record["error"] = str(exc)
    return record
|
||||
|
||||
|
||||
def _scrape_ts_from_filename(path: Path) -> str:
    """Pull the scrape timestamp out of a file name.

    Searches the name without its final suffix for ``YYYY-MM-DDT`` followed by
    a run of digits, ``-``, ``+`` or ``:``.

    Args:
        path: Path whose ``stem`` is searched.

    Returns:
        The matched timestamp with every ``:`` replaced by ``-``, or
        ``"unknown"`` when the name contains no such timestamp.
    """
    found = re.search(r"(\d{4}-\d{2}-\d{2}T[\d\-+:]+)", path.stem)
    if found is None:
        return "unknown"
    stamp = found.group(1)
    return stamp.replace(":", "-")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
|
||||
def main() -> None:
    """Command-line entry point: classify every comment in a scraped JSONL file.

    Loads environment variables with ``load_dotenv()``, parses the arguments,
    requires ``OPENAI_API_KEY`` and an existing input file (exiting with a
    message otherwise), then analyzes the comments one at a time. Each record
    is written as one JSON line to a file placed next to this script and
    flushed immediately; progress and the final tally go to stderr.
    """
    load_dotenv()

    cli = argparse.ArgumentParser(
        description="Analyze VA Townhall public comments with GPT-4o (synchronous).",
    )
    cli.add_argument("input", help="Path to scraped JSONL file")
    cli.add_argument(
        "--limit",
        type=int,
        choices=[5, 10, 20, 50],
        metavar="{5,10,20,50}",
        help="Process only the first N comments (for testing). Omit to process all.",
    )
    cli.add_argument("--model", default="gpt-4o", help="OpenAI model (default: gpt-4o)")
    opts = cli.parse_args()

    key = os.environ.get("OPENAI_API_KEY")
    if not key:
        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")

    source = Path(opts.input)
    if not source.exists():
        sys.exit(f"File not found: {source}")

    print(f"Reading {source} ...", file=sys.stderr)
    forum, comments = load_items(source)
    if forum is None:
        print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)
    if opts.limit:
        comments = comments[: opts.limit]

    # Output name: forum id, scrape timestamp, model, and this run's UTC timestamp.
    forum_id = (forum or {}).get("forum_id", "unknown")
    scrape_ts = _scrape_ts_from_filename(source)
    run_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S+00-00")
    model_slug = opts.model.replace("/", "-")
    out_path = Path(__file__).parent / f"forum{forum_id}_{scrape_ts}_{model_slug}_{run_ts}.jsonl"

    run_id = str(uuid.uuid4())
    client = openai.OpenAI(api_key=key)

    succeeded = 0
    failed = 0
    total = len(comments)
    print(f"Analyzing {total} comments → {out_path}", file=sys.stderr)

    with open(out_path, "w", encoding="utf-8") as sink:
        for position, comment in enumerate(comments, start=1):
            result = analyze_comment(client, comment, forum, run_id, opts.model)
            # Flush after each record so finished work is on disk if the run is interrupted.
            sink.write(json.dumps(result, ensure_ascii=False) + "\n")
            sink.flush()

            cid = comment.get("comment_id")
            if result["error"]:
                failed += 1
                print(f" [{position}/{total}] ERROR {cid}: {result['error']}", file=sys.stderr)
            else:
                succeeded += 1
                print(f" [{position}/{total}] OK {cid} → {result['stance']}", file=sys.stderr)
            # Short pause between consecutive requests.
            time.sleep(0.1)

    print(f"\nDone. {succeeded} ok, {failed} errors → {out_path}", file=sys.stderr)
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user