gpt4o 1.2 cleanup

2026-05-05 15:12:32 -04:00
parent 490c642bd9
commit 122c1ce939
3 changed files with 387 additions and 2 deletions
--- a/analysis/gpt4o/analysis-realtime.py
+++ b/analysis/gpt4o/analysis-realtime.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+analysis/gpt4o/analysis-realtime.py — Manual GPT-4o sentiment pipeline for VA Townhall comments.
+
+Usage:
+    python analysis/gpt4o/analysis-realtime.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
+
+Output:
+    analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
+"""
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+try:
+    import openai
+except ImportError:
+    sys.exit("openai package not installed. Run: pip install openai")
+
+# ---------------------------------------------------------------------------
+# Prompt (version is derived from the content — changing either string changes PROMPT_VERSION)
+
+# System message sent verbatim as the first chat message by build_messages().
+# It fixes the label vocabulary and names the five keys that
+# parse_api_response() extracts from the model's reply.
+# The text is hashed into PROMPT_VERSION, so any edit here (even whitespace)
+# produces a different version tag.
+# The backslash after the opening quotes and the one ending the last line
+# suppress the literal's leading and trailing newline.
+SYSTEM_PROMPT = """\
+You are an expert policy analyst classifying public comments submitted to the Virginia Town Hall
+regulatory comment system. You will be given the text of a proposed regulation and a single
+public comment. Return ONLY a JSON object — no other text.
+
+Definitions:
+- stance: the commenter's position on whether the regulation should be adopted.
+  "support" = wants it approved (as-is or with changes);
+  "oppose"  = wants it rejected or substantially weakened;
+  "neutral" = takes no position, asks a question, or provides factual input only;
+  "unknown" = too vague, off-topic, or uninterpretable to classify.
+- tone: the emotional register of the writing, independent of stance.
+  "positive" = affirming, hopeful, appreciative;
+  "negative" = angry, fearful, alarmed, or contemptuous;
+  "neutral"  = matter-of-fact, procedural, or informational;
+  "mixed"    = contains both positive and negative emotional content;
+  "unclear"  = tone cannot be determined (e.g., a one-word comment).
+- stance_confidence: float 0.0–1.0, your confidence in the stance label.
+- stance_rationale: 1–3 sentences explaining the key evidence; quote specific phrases where possible.
+- tags: up to 5 short topic labels relevant to the comment's specific concerns (e.g.
+  "parental rights", "student safety", "privacy", "religious freedom", "LGBTQ+ inclusion",
+  "bullying prevention", "school sports", "bathroom access"). Empty array if none apply.
+
+Return exactly these keys: stance, stance_confidence, stance_rationale, tone, tags.\
+"""
+
+# Per-comment user message, rendered with str.format() in build_messages().
+# Placeholders: reg_title, reg_desc, comment_id, comment_title, comment_text.
+# Because str.format() is used, any literal brace added to this template
+# must be doubled. The text is hashed into PROMPT_VERSION together with
+# SYSTEM_PROMPT, so editing it changes the version tag.
+USER_TEMPLATE = """\
+## Proposed Regulation
+Title: {reg_title}
+Description: {reg_desc}
+
+---
+
+## Public Comment
+Comment ID: {comment_id}
+Title: {comment_title}
+Body:
+{comment_text}
+
+---
+Classify this comment per the instructions. Return only JSON.\
+"""
+
+# Content hash of both prompt strings: the first 7 hex characters of the
+# SHA-256 of their concatenation. analyze_comment() stamps it on every
+# output record as "prompt_version".
+PROMPT_VERSION = hashlib.sha256(
+    (SYSTEM_PROMPT + USER_TEMPLATE).encode("utf-8")
+).hexdigest()[:7]
+
+# Comment bodies longer than this many characters are cut by build_messages().
+MAX_COMMENT_CHARS = 6000
+# Sleep durations (seconds, passed to time.sleep) used by _call_api(), which
+# prepends an immediate first attempt to this list.
+_RETRY_DELAYS = [1.0, 2.0]  # delays before attempt 2 and 3
+
+# ---------------------------------------------------------------------------
+# Core functions (importable for tests)
+
+
+def load_items(path: Path) -> tuple[dict | None, list[dict]]:
+    """Read a scraped JSONL file and split it into forum and comment records.
+
+    Each non-blank line is parsed as JSON. Objects containing a
+    ``comment_id`` key are collected as comments, in file order. An object
+    containing ``reg_title`` (and no ``comment_id``) is taken as the forum
+    record; if several are present the last one wins. Lines that decode to
+    anything other than a JSON object, and objects with neither key, are
+    ignored.
+
+    Args:
+        path: Location of the JSONL file (read as UTF-8).
+
+    Returns:
+        ``(forum, comments)``; ``forum`` is ``None`` when the file holds no
+        forum record.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    forum = None
+    comments = []
+    with open(path, encoding="utf-8") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            item = json.loads(line)
+            # Only JSON objects are records. Without this guard a bare string
+            # such as "comment_id" (substring match) or a list containing it
+            # would pass the `in` test and be returned as a comment, which
+            # then breaks every later .get() call on it.
+            if not isinstance(item, dict):
+                continue
+            if "comment_id" in item:
+                comments.append(item)
+            elif "reg_title" in item:
+                forum = item
+    return forum, comments
+
+
+def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
+    """Build the OpenAI messages list for one comment.
+
+    The system message is SYSTEM_PROMPT; the user message is USER_TEMPLATE
+    filled in from the forum record and the comment.
+
+    Args:
+        comment: Comment record; the keys ``comment_id``, ``title`` and
+            ``text`` are read, all optional.
+        forum: Forum record providing ``reg_title`` / ``reg_desc``, or
+            ``None`` when no regulation context is available (both then
+            render as ``[unknown]``).
+
+    Returns:
+        ``(messages, truncated)`` where ``truncated`` is True if the comment
+        body was cut to MAX_COMMENT_CHARS.
+    """
+
+    def _field(record: dict, key: str, default: str):
+        # Treat an explicit JSON null like a missing key. dict.get()'s
+        # default only applies to absent keys, so a null would otherwise be
+        # formatted into the prompt as the literal text "None".
+        value = record.get(key)
+        return default if value is None else value
+
+    forum_record = forum or {}
+    reg_title = _field(forum_record, "reg_title", "[unknown]")
+    reg_desc = _field(forum_record, "reg_desc", "[unknown]")
+
+    body = (comment.get("text") or "").strip()
+    truncated = False
+    if not body:
+        # Give the model an explicit marker rather than an empty section.
+        body = "[No body text provided]"
+    elif len(body) > MAX_COMMENT_CHARS:
+        body = body[:MAX_COMMENT_CHARS] + "... [truncated]"
+        truncated = True
+
+    user_text = USER_TEMPLATE.format(
+        reg_title=reg_title,
+        reg_desc=reg_desc,
+        comment_id=_field(comment, "comment_id", ""),
+        comment_title=_field(comment, "title", ""),
+        comment_text=body,
+    )
+
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_text},
+    ], truncated
+
+
+def _call_api(client, messages: list, model: str) -> str:
+    """Call the OpenAI chat API, retrying on rate limits.
+
+    One immediate attempt is made, followed by one further attempt per entry
+    in _RETRY_DELAYS, sleeping for that many seconds first. Only
+    ``openai.RateLimitError`` triggers a retry; any other exception
+    propagates at once.
+
+    Args:
+        client: Object exposing ``chat.completions.create`` (an
+            ``openai.OpenAI`` client in ``main``).
+        messages: Chat messages as produced by build_messages().
+        model: Model name passed through to the API.
+
+    Returns:
+        The text content of the first choice, requested in JSON-object mode
+        at temperature 0.
+
+    Raises:
+        openai.RateLimitError: If every attempt was rate-limited (the last
+            error is re-raised).
+        ValueError: If the response carries no text content.
+    """
+    last_exc = None
+    for delay in [0.0, *_RETRY_DELAYS]:
+        if delay:
+            time.sleep(delay)
+        try:
+            resp = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                response_format={"type": "json_object"},
+                temperature=0.0,
+            )
+        except openai.RateLimitError as exc:
+            last_exc = exc
+            continue
+        choice = resp.choices[0]
+        content = choice.message.content
+        if content is None:
+            # The content field is nullable. Fail here with a clear message
+            # instead of letting json.loads(None) raise an opaque TypeError
+            # further down the pipeline.
+            finish_reason = getattr(choice, "finish_reason", None)
+            raise ValueError(
+                f"model returned no message content (finish_reason={finish_reason!r})"
+            )
+        return content
+    raise last_exc  # type: ignore[misc]
+
+
+def parse_api_response(content: str) -> dict:
+    """Decode the model's JSON reply and keep only the five expected fields.
+
+    Fields absent from the reply come back as ``None``; any additional keys
+    the model returned are dropped.
+    """
+    payload = json.loads(content)
+    result = {}
+    for field in (
+        "stance",
+        "stance_confidence",
+        "stance_rationale",
+        "tone",
+        "tags",
+    ):
+        result[field] = payload.get(field)
+    return result
+
+
+def analyze_comment(
+    client,
+    comment: dict,
+    forum: dict | None,
+    run_id: str,
+    model: str,
+) -> dict:
+    """Analyze one comment and return a fully-formed output record.
+
+    This function never raises for per-comment failures: any exception from
+    prompt building, the API call or response parsing is captured in the
+    record's ``error`` field so a batch run can continue.
+
+    Args:
+        client: OpenAI client handed through to _call_api().
+        comment: Comment record (``forum_id``, ``comment_id``, ``title`` and
+            ``text`` are read).
+        forum: Forum record, or ``None`` if no regulation context exists.
+        run_id: Identifier shared by all records of one run.
+        model: Model name, recorded and passed to the API.
+
+    Returns:
+        A dict with the run/comment metadata, the five classification
+        fields, ``truncated`` and ``error``. On success ``error`` is
+        ``None``; on failure the classification fields are ``None`` and
+        ``error`` is a non-empty string.
+    """
+    base = {
+        "run_id":         run_id,
+        "forum_id":       comment.get("forum_id", ""),
+        "comment_id":     comment.get("comment_id", ""),
+        "analyzed_at":    datetime.now(timezone.utc).isoformat(),
+        "model":          model,
+        "prompt_version": PROMPT_VERSION,
+        "input_title":    comment.get("title", ""),
+    }
+    # Set before the try block so that a failure *after* build_messages()
+    # still reports whether the body had been truncated.
+    truncated = False
+    try:
+        messages, truncated = build_messages(comment, forum)
+        content = _call_api(client, messages, model)
+        parsed = parse_api_response(content)
+    except Exception as exc:
+        # Deliberate catch-all: one bad comment must not abort the batch.
+        return {
+            **base,
+            "stance": None, "stance_confidence": None,
+            "stance_rationale": None, "tone": None, "tags": None,
+            "truncated": truncated,
+            # Some exceptions stringify to "". Fall back to the class name so
+            # a failed record can never look like a success to callers that
+            # test the truthiness of "error".
+            "error": str(exc) or type(exc).__name__,
+        }
+    return {**base, **parsed, "truncated": truncated, "error": None}
+
+
+def _scrape_ts_from_filename(path: Path) -> str:
+    """Return the timestamp embedded in a scraped JSONL filename.
+
+    The file stem is searched for ``YYYY-MM-DDT`` followed by a run of
+    digits, ``-``, ``+`` or ``:``. Colons in the match are replaced with
+    hyphens. If the stem contains no such timestamp, ``"unknown"`` is
+    returned.
+    """
+    found = re.search(r"(\d{4}-\d{2}-\d{2}T[\d\-+:]+)", path.stem)
+    if not found:
+        return "unknown"
+    stamp = found.group(1)
+    return stamp.replace(":", "-")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+
+
+def main() -> None:
+    """Command-line entry point: classify every comment in a scraped JSONL file.
+
+    Loads environment variables via ``load_dotenv()``, parses the command
+    line, reads the input with load_items(), then analyzes the comments one
+    by one. Each result is written as a JSON line to
+    ``forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl`` in this script's own
+    directory and flushed immediately, so a partial run still leaves usable
+    output. Progress and the final summary go to stderr.
+
+    Exits via ``sys.exit`` with a message if OPENAI_API_KEY is unset or the
+    input file does not exist.
+    """
+    load_dotenv()
+
+    parser = argparse.ArgumentParser(
+        description="Analyze VA Townhall public comments with GPT-4o.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("input", help="Path to scraped JSONL file")
+    parser.add_argument(
+        "--limit",
+        type=int,
+        choices=[5, 10, 20, 50],
+        metavar="{5,10,20,50}",
+        help="Process only the first N comments (for testing). Omit to process all.",
+    )
+    parser.add_argument(
+        "--model",
+        default="gpt-4o",
+        help="OpenAI model name (default: gpt-4o)",
+    )
+    args = parser.parse_args()
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
+
+    input_path = Path(args.input)
+    if not input_path.exists():
+        sys.exit(f"File not found: {input_path}")
+
+    print(f"Reading {input_path} ...", file=sys.stderr)
+    forum, comments = load_items(input_path)
+
+    if forum is None:
+        print(
+            "Warning: no ForumItem found in file — regulation context will be [unknown].",
+            file=sys.stderr,
+        )
+
+    if args.limit is not None:
+        comments = comments[: args.limit]
+
+    forum_id   = (forum or {}).get("forum_id", "unknown")
+    scrape_ts  = _scrape_ts_from_filename(input_path)
+    run_ts     = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S+00-00")
+    # Model names may contain "/", which would otherwise split the filename
+    # into directories.
+    model_slug = args.model.replace("/", "-")
+
+    out_dir  = Path(__file__).parent
+    out_path = out_dir / f"forum{forum_id}_{scrape_ts}_{model_slug}_{run_ts}.jsonl"
+
+    run_id = str(uuid.uuid4())
+    client = openai.OpenAI(api_key=api_key)
+
+    n_ok = n_err = 0
+    total = len(comments)
+    print(f"Analyzing {total} comments → {out_path}", file=sys.stderr)
+
+    with open(out_path, "w", encoding="utf-8") as out:
+        for i, comment in enumerate(comments, 1):
+            record = analyze_comment(client, comment, forum, run_id, args.model)
+            out.write(json.dumps(record, ensure_ascii=False) + "\n")
+            # Flush per record so an interrupted run keeps what was finished.
+            out.flush()
+            # Success is signalled by error being None. A truthiness test
+            # would count a failure with an empty message as OK.
+            if record["error"] is not None:
+                n_err += 1
+                print(
+                    f"  [{i}/{total}] ERROR {comment.get('comment_id')}: {record['error']}",
+                    file=sys.stderr,
+                )
+            else:
+                n_ok += 1
+                print(
+                    f"  [{i}/{total}] OK    {comment.get('comment_id')} → {record['stance']}",
+                    file=sys.stderr,
+                )
+            # Short pause between requests.
+            time.sleep(0.1)
+
+    print(f"\nDone. {n_ok} ok, {n_err} errors → {out_path}", file=sys.stderr)
+
+
+# Run the CLI only when executed as a script; importing the module (e.g. to
+# use the core functions above) has no side effects beyond the definitions.
+if __name__ == "__main__":
+    main()