remove hyphen for underscore in nomenclature, remove dependency

2026-05-05 16:47:11 -04:00
parent fd9d656e13
commit 683bfb324f
5 changed files with 67 additions and 127 deletions
--- a/analysis/gpt4o/analysis_realtime.py
+++ b/analysis/gpt4o/analysis_realtime.py
@@ -0,0 +1,228 @@
+#!/usr/bin/env python3
+"""
+analysis/gpt4o/analysis_realtime.py — Synchronous GPT-4o pipeline for VA Townhall comments.
+
+Usage:
+    python analysis/gpt4o/analysis_realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
+
+Output:
+    analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
+"""
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+try:
+    import openai
+except ImportError:
+    sys.exit("openai package not installed. Run: pip install openai")
+
+# ---------------------------------------------------------------------------
+# Prompt — loaded from analysis/prompt-1.txt at import time
+
+# Prompt file lives one directory above this script's directory.
+_PROMPT_FILE = Path(__file__).parent.parent / "prompt-1.txt"
+# Read eagerly: importing this module raises if the prompt file cannot be read.
+SYSTEM_PROMPT = _PROMPT_FILE.read_text(encoding="utf-8").strip()
+# First 7 hex characters of the SHA-256 of the prompt text; stored in every
+# output record so results can be tied to the exact prompt that produced them.
+PROMPT_VERSION = hashlib.sha256(SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
+
+# User-message template, filled in by build_messages() via str.format().
+USER_TEMPLATE = """\
+## Proposed Regulation
+Title: {reg_title}
+Description: {reg_desc}
+
+---
+
+## Public Comment
+Comment ID: {comment_id}
+Title: {comment_title}
+Body:
+{comment_text}
+
+---
+Classify this comment per the instructions. Return only JSON.\
+"""
+
+# Comment bodies longer than this many characters are cut before being sent.
+MAX_COMMENT_CHARS = 6000
+# Seconds to sleep before each retry after a rate-limit error (see _call_api).
+_RETRY_DELAYS = [1.0, 2.0]
+
+# ---------------------------------------------------------------------------
+# Core functions
+
+
+def load_items(path: Path) -> tuple[dict | None, list[dict]]:
+    """Parse a scraped JSONL file into its forum record and comment records.
+
+    Blank lines are skipped. A record containing a ``comment_id`` key is
+    collected as a comment (file order is kept); otherwise a record containing
+    a ``reg_title`` key becomes the forum record, the last one seen winning.
+    Records with neither key are ignored.
+
+    Returns:
+        ``(forum, comments)``; ``forum`` is ``None`` when the file holds no
+        forum record.
+    """
+    forum_item = None
+    comment_items = []
+    with open(path, encoding="utf-8") as handle:
+        # Both generators are lazy and fully consumed inside the ``with``.
+        stripped_lines = (raw.strip() for raw in handle)
+        records = (json.loads(text) for text in stripped_lines if text)
+        for record in records:
+            if "comment_id" in record:
+                comment_items.append(record)
+                continue
+            if "reg_title" in record:
+                forum_item = record
+    return forum_item, comment_items
+
+
+def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
+    """Build the chat messages for one comment.
+
+    Args:
+        comment: Scraped comment record; ``comment_id``, ``title`` and
+            ``text`` are read from it.
+        forum: Scraped forum record supplying ``reg_title`` / ``reg_desc``,
+            or ``None`` when no forum record was found.
+
+    Returns:
+        ``(messages, truncated)`` where ``messages`` is the system + user
+        message list and ``truncated`` is ``True`` when the comment body
+        exceeded ``MAX_COMMENT_CHARS`` and was cut.
+    """
+    forum = forum or {}
+    # Use ``or`` rather than a .get() default: a key that is present but null
+    # (or empty) would otherwise be rendered into the prompt as "None" / "".
+    reg_title = forum.get("reg_title") or "[unknown]"
+    reg_desc = forum.get("reg_desc") or "[unknown]"
+
+    body = (comment.get("text") or "").strip()
+    truncated = False
+    if not body:
+        body = "[No body text provided]"
+    elif len(body) > MAX_COMMENT_CHARS:
+        body = body[:MAX_COMMENT_CHARS] + "... [truncated]"
+        truncated = True
+
+    user_text = USER_TEMPLATE.format(
+        reg_title=reg_title,
+        reg_desc=reg_desc,
+        comment_id=comment.get("comment_id", ""),
+        # Same null guard as ``text`` above, so a null title stays blank.
+        comment_title=comment.get("title") or "",
+        comment_text=body,
+    )
+
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user",   "content": user_text},
+    ], truncated
+
+
+def parse_api_response(content: str) -> dict:
+    """Decode the model's JSON reply and keep only the classification fields.
+
+    Fields absent from the reply are returned as ``None``; any extra keys in
+    the reply are dropped. Raises whatever ``json.loads`` raises on bad input.
+    """
+    payload = json.loads(content)
+    result = {}
+    for field in ("stance", "stance_confidence", "stance_rationale", "tone", "tags"):
+        result[field] = payload.get(field)
+    return result
+
+
+def _call_api(client, messages: list, model: str) -> str:
+    """Send one chat-completion request, retrying on rate-limit errors.
+
+    The first attempt is immediate; each further attempt is preceded by the
+    matching pause from ``_RETRY_DELAYS``. Only ``openai.RateLimitError`` is
+    retried; any other exception propagates at once. When every attempt is
+    rate-limited, the last such error is re-raised.
+
+    Returns:
+        The message content of the first choice in the response.
+    """
+    rate_limit_error = None
+    pauses = [0.0, *_RETRY_DELAYS]
+    for pause in pauses:
+        if pause:
+            time.sleep(pause)
+        request = {
+            "model": model,
+            "messages": messages,
+            "response_format": {"type": "json_object"},
+            "temperature": 0.0,
+        }
+        try:
+            completion = client.chat.completions.create(**request)
+            return completion.choices[0].message.content
+        except openai.RateLimitError as exc:
+            rate_limit_error = exc
+    raise rate_limit_error  # type: ignore[misc]
+
+
+def analyze_comment(client, comment: dict, forum: dict | None, run_id: str, model: str) -> dict:
+    """Classify one comment and return its output record.
+
+    Never raises for a failure while building, sending or parsing the request:
+    the failure is captured in the record instead, so one bad comment does not
+    abort the run.
+
+    Args:
+        client: OpenAI client used for the request.
+        comment: Scraped comment record.
+        forum: Scraped forum record, or ``None``.
+        run_id: Identifier shared by all records of this run.
+        model: Model name passed to the API and stored in the record.
+
+    Returns:
+        A dict with run metadata, the classification fields, ``truncated``
+        and ``error``. On success ``error`` is ``None``; on failure the
+        classification fields are ``None`` and ``error`` is a non-empty
+        description of the exception.
+    """
+    base = {
+        "run_id":         run_id,
+        "forum_id":       comment.get("forum_id", ""),
+        "comment_id":     comment.get("comment_id", ""),
+        "analyzed_at":    datetime.now(timezone.utc).isoformat(),
+        "model":          model,
+        "prompt_version": PROMPT_VERSION,
+        "input_title":    comment.get("title", ""),
+    }
+    try:
+        messages, truncated = build_messages(comment, forum)
+        content = _call_api(client, messages, model)
+        parsed = parse_api_response(content)
+        return {**base, **parsed, "truncated": truncated, "error": None}
+    except Exception as exc:
+        return {
+            **base,
+            "stance": None, "stance_confidence": None,
+            "stance_rationale": None, "tone": None, "tags": None,
+            "truncated": False,
+            # str(exc) is "" for exceptions raised without a message; fall back
+            # to repr so a failed record never carries an empty, falsy error.
+            "error": str(exc) or repr(exc),
+        }
+
+
+def _scrape_ts_from_filename(path: Path) -> str:
+    """Extract the scrape timestamp embedded in the input file's name.
+
+    Looks in the file stem for a ``YYYY-MM-DDT...`` run of digits, ``-``, ``+``
+    and ``:``. Returns it with every ``:`` replaced by ``-``, or ``"unknown"``
+    when the stem contains no such timestamp.
+    """
+    found = re.search(r"(\d{4}-\d{2}-\d{2}T[\d\-+:]+)", path.stem)
+    if found is None:
+        return "unknown"
+    timestamp = found.group(1)
+    return timestamp.replace(":", "-")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+
+def main() -> None:
+    """Command-line entry point: classify every comment in a scraped JSONL file.
+
+    Loads environment variables (including from a .env file), parses the
+    arguments, reads the input, then analyzes the comments one at a time and
+    writes one JSON record per comment to an output file in this script's
+    directory. Progress and the final summary go to stderr.
+
+    Exits with a message when ``OPENAI_API_KEY`` is not set or the input file
+    does not exist.
+    """
+    load_dotenv()
+
+    parser = argparse.ArgumentParser(
+        description="Analyze VA Townhall public comments with GPT-4o (synchronous).",
+    )
+    parser.add_argument("input", help="Path to scraped JSONL file")
+    parser.add_argument(
+        "--limit",
+        type=int,
+        choices=[5, 10, 20, 50],
+        metavar="{5,10,20,50}",
+        help="Process only the first N comments (for testing). Omit to process all.",
+    )
+    parser.add_argument("--model", default="gpt-4o", help="OpenAI model (default: gpt-4o)")
+    args = parser.parse_args()
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        sys.exit(f"File not found: {input_path}")
+
+    print(f"Reading {input_path} ...", file=sys.stderr)
+    forum, comments = load_items(input_path)
+
+    if forum is None:
+        print("Warning: no ForumItem found — regulation context will be [unknown].", file=sys.stderr)
+
+    if args.limit:
+        comments = comments[: args.limit]
+
+    forum_id   = (forum or {}).get("forum_id", "unknown")
+    scrape_ts  = _scrape_ts_from_filename(input_path)
+    # Colons are avoided in the timestamp so it is safe inside a file name.
+    run_ts     = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S+00-00")
+    # Model names may contain "/" which would otherwise create a subdirectory.
+    model_slug = args.model.replace("/", "-")
+
+    out_dir  = Path(__file__).parent
+    out_path = out_dir / f"forum{forum_id}_{scrape_ts}_{model_slug}_{run_ts}.jsonl"
+
+    run_id = str(uuid.uuid4())
+    client = openai.OpenAI(api_key=api_key)
+
+    n_ok = n_err = 0
+    total = len(comments)
+    print(f"Analyzing {total} comments → {out_path}", file=sys.stderr)
+
+    with open(out_path, "w", encoding="utf-8") as out:
+        for i, comment in enumerate(comments, 1):
+            record = analyze_comment(client, comment, forum, run_id, args.model)
+            out.write(json.dumps(record, ensure_ascii=False) + "\n")
+            # Flush per record so partial results survive an interrupted run.
+            out.flush()
+            # Success is signalled by error being None; a truthiness test
+            # would count a failure with an empty message as a success.
+            if record["error"] is not None:
+                n_err += 1
+                print(f"  [{i}/{total}] ERROR {comment.get('comment_id')}: {record['error']}", file=sys.stderr)
+            else:
+                n_ok += 1
+                print(f"  [{i}/{total}] OK    {comment.get('comment_id')} → {record['stance']}", file=sys.stderr)
+            # Small pause between requests.
+            time.sleep(0.1)
+
+    print(f"\nDone. {n_ok} ok, {n_err} errors → {out_path}", file=sys.stderr)
+
+
+# Run the CLI only when this file is executed as a script, not when imported.
+if __name__ == "__main__":
+    main()