added 4o initial manual analysis and test

2026-05-05 15:00:34 -04:00
parent c8017c908d
commit d834d18c81
5 changed files with 540 additions and 3 deletions
--- a/analysis/gpt4o/init.py
+++ b/analysis/gpt4o/init.py
--- a/analysis/gpt4o/analysis.py
+++ b/analysis/gpt4o/analysis.py
@@ -0,0 +1,283 @@
+#!/usr/bin/env python3
+"""
+analysis/gpt4o/analysis.py — Manual GPT-4o sentiment pipeline for VA Townhall comments.
+
+Usage:
+    python analysis/gpt4o/analysis.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
+
+Output:
+    analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
+"""
+
+import argparse
+import hashlib
+import json
+import os
+import re
+import sys
+import time
+import uuid
+from datetime import datetime, timezone
+from pathlib import Path
+
+from dotenv import load_dotenv
+
+try:
+    import openai
+except ImportError:
+    sys.exit("openai package not installed. Run: pip install openai")
+
+# ---------------------------------------------------------------------------
+# Prompt (version is derived from the content — changing either string changes PROMPT_VERSION)
+
+# System message sent with every request. It defines the label vocabulary for
+# stance and tone and names the exact JSON keys that parse_api_response reads.
+# NOTE: this text is hashed into PROMPT_VERSION, so any edit (even whitespace)
+# produces a new version id.
+SYSTEM_PROMPT = """\
+You are an expert policy analyst classifying public comments submitted to the Virginia Town Hall
+regulatory comment system. You will be given the text of a proposed regulation and a single
+public comment. Return ONLY a JSON object — no other text.
+
+Definitions:
+- stance: the commenter's position on whether the regulation should be adopted.
+  "support" = wants it approved (as-is or with changes);
+  "oppose"  = wants it rejected or substantially weakened;
+  "neutral" = takes no position, asks a question, or provides factual input only;
+  "unknown" = too vague, off-topic, or uninterpretable to classify.
+- tone: the emotional register of the writing, independent of stance.
+  "positive" = affirming, hopeful, appreciative;
+  "negative" = angry, fearful, alarmed, or contemptuous;
+  "neutral"  = matter-of-fact, procedural, or informational;
+  "mixed"    = contains both positive and negative emotional content;
+  "unclear"  = tone cannot be determined (e.g., a one-word comment).
+- stance_confidence: float 0.0–1.0, your confidence in the stance label.
+- stance_rationale: 1–3 sentences explaining the key evidence; quote specific phrases where possible.
+- tags: up to 5 short topic labels relevant to the comment's specific concerns (e.g.
+  "parental rights", "student safety", "privacy", "religious freedom", "LGBTQ+ inclusion",
+  "bullying prevention", "school sports", "bathroom access"). Empty array if none apply.
+
+Return exactly these keys: stance, stance_confidence, stance_rationale, tone, tags.\
+"""
+
+# Per-comment user message. The {placeholders} are filled with str.format in
+# build_messages; like SYSTEM_PROMPT, this text is hashed into PROMPT_VERSION.
+USER_TEMPLATE = """\
+## Proposed Regulation
+Title: {reg_title}
+Description: {reg_desc}
+
+---
+
+## Public Comment
+Comment ID: {comment_id}
+Title: {comment_title}
+Body:
+{comment_text}
+
+---
+Classify this comment per the instructions. Return only JSON.\
+"""
+
+# Short content hash of both prompt strings: the first 7 hex digits of the
+# SHA-256 of their concatenation. analyze_comment stores it on every record.
+PROMPT_VERSION = hashlib.sha256(
+    (SYSTEM_PROMPT + USER_TEMPLATE).encode("utf-8")
+).hexdigest()[:7]
+
+# Comment bodies longer than this many characters are cut in build_messages.
+MAX_COMMENT_CHARS = 6000
+# Seconds slept before each retry in _call_api after a rate-limit error.
+_RETRY_DELAYS = [1.0, 2.0]  # delays before attempt 2 and 3
+
+# ---------------------------------------------------------------------------
+# Core functions (importable for tests)
+
+
+def load_items(path: Path) -> tuple[dict | None, list[dict]]:
+    """Read a scraped JSONL file and split it into forum and comment items.
+
+    Each non-blank line is parsed as JSON. An object with a "comment_id" key
+    is collected as a comment; otherwise an object with a "reg_title" key is
+    taken as the forum item (if several appear, the last one wins). Any other
+    line, including JSON values that are not objects, is ignored.
+
+    Args:
+        path: Location of the JSONL file.
+
+    Returns:
+        (forum_item_or_None, [comment_items]), comments in file order.
+
+    Raises:
+        OSError: If the file cannot be opened.
+        json.JSONDecodeError: If a non-blank line is not valid JSON.
+    """
+    forum = None
+    comments = []
+    # "utf-8-sig" decodes plain UTF-8 identically but also drops a leading
+    # byte-order mark, which json.loads would otherwise reject on line one.
+    with open(path, encoding="utf-8-sig") as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            item = json.loads(line)
+            if not isinstance(item, dict):
+                # Bare numbers/strings/lists are not scraped items. Without
+                # this guard, `"comment_id" in item` raises on a number and
+                # substring-matches on a string.
+                continue
+            if "comment_id" in item:
+                comments.append(item)
+            elif "reg_title" in item:
+                forum = item
+    return forum, comments
+
+
+def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
+    """Build the OpenAI messages list for one comment.
+
+    Missing, null, or empty regulation fields are rendered as "[unknown]", and
+    a missing or null comment title/id is rendered as an empty string, so the
+    literal text "None" never reaches the prompt. A missing or blank body is
+    replaced by "[No body text provided]".
+
+    Args:
+        comment: Scraped comment item; reads "text", "title" and "comment_id".
+        forum: Scraped forum item, or None when the input had none; reads
+            "reg_title" and "reg_desc".
+
+    Returns:
+        (messages, truncated) where messages is the system + user message pair
+        and truncated is True if the comment body was cut to
+        MAX_COMMENT_CHARS.
+    """
+    forum = forum or {}
+    # `.get(key, default)` only applies the default when the key is absent; a
+    # JSON null would come back as None and be formatted as "None".
+    reg_title = forum.get("reg_title") or "[unknown]"
+    reg_desc = forum.get("reg_desc") or "[unknown]"
+
+    body = (comment.get("text") or "").strip()
+    truncated = False
+    if not body:
+        body = "[No body text provided]"
+    elif len(body) > MAX_COMMENT_CHARS:
+        body = body[:MAX_COMMENT_CHARS] + "... [truncated]"
+        truncated = True
+
+    # Explicit None check (not `or`) so a legitimate id of 0 is kept.
+    comment_id = comment.get("comment_id")
+    if comment_id is None:
+        comment_id = ""
+
+    user_text = USER_TEMPLATE.format(
+        reg_title=reg_title,
+        reg_desc=reg_desc,
+        comment_id=comment_id,
+        comment_title=comment.get("title") or "",
+        comment_text=body,
+    )
+
+    return [
+        {"role": "system", "content": SYSTEM_PROMPT},
+        {"role": "user", "content": user_text},
+    ], truncated
+
+
+def _call_api(client, messages: list, model: str) -> str:
+    """Request one JSON-mode chat completion, retrying when rate-limited.
+
+    The first attempt is made immediately; each further attempt is preceded by
+    the next pause from _RETRY_DELAYS. Only openai.RateLimitError triggers a
+    retry; any other exception propagates straight away.
+
+    Returns:
+        The content of the first choice's message.
+
+    Raises:
+        openai.RateLimitError: The last one seen, if every attempt was
+            rate-limited.
+    """
+    pauses = [0.0, *_RETRY_DELAYS]
+    rate_limit_error = None
+    for pause in pauses:
+        # A zero pause (the first attempt) skips the sleep entirely.
+        if pause:
+            time.sleep(pause)
+        try:
+            completion = client.chat.completions.create(
+                model=model,
+                messages=messages,
+                response_format={"type": "json_object"},
+                temperature=0.0,
+            )
+            return completion.choices[0].message.content
+        except openai.RateLimitError as exc:
+            rate_limit_error = exc
+            continue
+    raise rate_limit_error  # type: ignore[misc]
+
+
+def parse_api_response(content: str) -> dict:
+    """Decode the model's JSON reply and keep only the five expected fields.
+
+    Fields absent from the reply are present in the result with value None;
+    any additional keys in the reply are dropped.
+    """
+    payload = json.loads(content)
+    result = {}
+    for field in ("stance", "stance_confidence", "stance_rationale", "tone", "tags"):
+        result[field] = payload.get(field)
+    return result
+
+
+def analyze_comment(
+    client,
+    comment: dict,
+    forum: dict | None,
+    run_id: str,
+    model: str,
+) -> dict:
+    """Classify a single comment and return its complete output record.
+
+    The record always starts with the run/comment metadata. On success it
+    continues with the parsed model fields, the truncation flag and
+    ``"error": None``. If building the prompt, calling the API, or parsing the
+    reply raises, the model fields are None, ``truncated`` is False and
+    ``error`` holds the exception text; the exception is not re-raised.
+    """
+    record = {
+        "run_id": run_id,
+        "forum_id": comment.get("forum_id", ""),
+        "comment_id": comment.get("comment_id", ""),
+        "analyzed_at": datetime.now(timezone.utc).isoformat(),
+        "model": model,
+        "prompt_version": PROMPT_VERSION,
+        "input_title": comment.get("title", ""),
+    }
+    try:
+        messages, truncated = build_messages(comment, forum)
+        raw_reply = _call_api(client, messages, model)
+        outcome = dict(
+            parse_api_response(raw_reply),
+            truncated=truncated,
+            error=None,
+        )
+    except Exception as exc:
+        # Best-effort per comment: one failure must not abort the whole run.
+        outcome = dict.fromkeys(
+            ("stance", "stance_confidence", "stance_rationale", "tone", "tags")
+        )
+        outcome["truncated"] = False
+        outcome["error"] = str(exc)
+    # Metadata keys first, then the outcome keys, matching the file layout.
+    return record | outcome
+
+
+def _scrape_ts_from_filename(path: Path) -> str:
+    """Return the timestamp embedded in a scraped file's name, or "unknown".
+
+    Looks in the filename stem for ``YYYY-MM-DDT`` followed by digits,
+    hyphens, plus signs or colons; colons in the match are replaced with
+    hyphens so the value can be reused in the output filename.
+    """
+    found = re.search(r"(\d{4}-\d{2}-\d{2}T[\d\-+:]+)", path.stem)
+    if found is None:
+        return "unknown"
+    timestamp = found.group(1)
+    return timestamp.replace(":", "-")
+
+
+# ---------------------------------------------------------------------------
+# CLI
+
+
+def main() -> None:
+    """Command-line entry point: classify every comment in a scraped JSONL file.
+
+    Loads environment variables via dotenv, parses the arguments, reads the
+    forum and comment items, then calls analyze_comment for each comment and
+    writes the resulting record as one JSON line to an output file in this
+    script's directory. Progress and the final ok/error tally go to stderr.
+
+    Exits with a message (non-zero status) when OPENAI_API_KEY is unset, when
+    the input path is not an existing regular file, or when the input cannot
+    be read or parsed.
+    """
+    load_dotenv()
+
+    parser = argparse.ArgumentParser(
+        description="Analyze VA Townhall public comments with GPT-4o.",
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument("input", help="Path to scraped JSONL file")
+    parser.add_argument(
+        "--limit",
+        type=int,
+        choices=[5, 10, 20, 50],
+        metavar="{5,10,20,50}",
+        help="Process only the first N comments (for testing). Omit to process all.",
+    )
+    parser.add_argument(
+        "--model",
+        default="gpt-4o",
+        help="OpenAI model name (default: gpt-4o)",
+    )
+    args = parser.parse_args()
+
+    api_key = os.environ.get("OPENAI_API_KEY")
+    if not api_key:
+        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")
+
+    input_path = Path(args.input)
+    # is_file() also rejects directories, which exists() would let through
+    # only to fail later inside open().
+    if not input_path.is_file():
+        sys.exit(f"File not found: {input_path}")
+
+    print(f"Reading {input_path} ...", file=sys.stderr)
+    try:
+        forum, comments = load_items(input_path)
+    except (OSError, ValueError) as exc:
+        # json.JSONDecodeError and UnicodeDecodeError are both ValueError
+        # subclasses; report them as a clean exit instead of a traceback.
+        sys.exit(f"Could not read {input_path}: {exc}")
+
+    if forum is None:
+        print(
+            "Warning: no ForumItem found in file — regulation context will be [unknown].",
+            file=sys.stderr,
+        )
+
+    if args.limit is not None:
+        comments = comments[: args.limit]
+
+    forum_id = (forum or {}).get("forum_id", "unknown")
+    scrape_ts = _scrape_ts_from_filename(input_path)
+    run_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S+00-00")
+    # Model names may contain "/" or ":" (e.g. fine-tuned model ids); map
+    # every character outside a portable filename set to "-".
+    model_slug = re.sub(r"[^A-Za-z0-9._-]", "-", args.model)
+
+    out_dir = Path(__file__).parent
+    out_path = out_dir / f"forum{forum_id}_{scrape_ts}_{model_slug}_{run_ts}.jsonl"
+
+    run_id = str(uuid.uuid4())
+    client = openai.OpenAI(api_key=api_key)
+
+    n_ok = n_err = 0
+    total = len(comments)
+    print(f"Analyzing {total} comments → {out_path}", file=sys.stderr)
+
+    with open(out_path, "w", encoding="utf-8") as out:
+        for i, comment in enumerate(comments, 1):
+            record = analyze_comment(client, comment, forum, run_id, args.model)
+            out.write(json.dumps(record, ensure_ascii=False) + "\n")
+            # Flush per record so an interrupted run keeps finished results.
+            out.flush()
+            if record["error"]:
+                n_err += 1
+                print(
+                    f"  [{i}/{total}] ERROR {comment.get('comment_id')}: {record['error']}",
+                    file=sys.stderr,
+                )
+            else:
+                n_ok += 1
+                print(
+                    f"  [{i}/{total}] OK    {comment.get('comment_id')} → {record['stance']}",
+                    file=sys.stderr,
+                )
+            # Small fixed pause between requests.
+            time.sleep(0.1)
+
+    print(f"\nDone. {n_ok} ok, {n_err} errors → {out_path}", file=sys.stderr)
+
+
+# Start the CLI only when this file is executed as a script; importing the
+# module (e.g. from tests) does not start an analysis run.
+if __name__ == "__main__":
+    main()
--- a/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T18-48-32+00-00.jsonl
+++ b/analysis/gpt4o/forum452_unknown_gpt-4o_2026-05-05T18-48-32+00-00.jsonl
@@ -0,0 +1,5 @@
+{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87914", "analyzed_at": "2026-05-05T18:48:32.792363+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Support the Model Policy Wholeheartedly", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I support the model policy wholeheartedly' and praises the policy for creating inclusive and welcoming schools for transgender and non-binary students. They also express gratitude towards the Virginia Department of Education for developing the policy.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "privacy", "bullying prevention"], "truncated": false, "error": null}
+{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87915", "analyzed_at": "2026-05-05T18:48:37.398155+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this vital policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, 'I strongly support these proposals,' indicating clear approval of the regulation. They also affirm the importance of treating every student with dignity and respect, aligning with the policy's goals.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "nondiscrimination"], "truncated": false, "error": null}
+{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87916", "analyzed_at": "2026-05-05T18:48:41.236389+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, \"I am in full support of this policy guidance,\" indicating clear support for the regulation. The phrase \"Trans rights are human rights\" further reinforces their supportive stance.", "tone": "positive", "tags": ["transgender rights", "nondiscrimination"], "truncated": false, "error": null}
+{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87917", "analyzed_at": "2026-05-05T18:48:44.321705+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 0.95, "stance_rationale": "The commenter explicitly states 'Please support this policy' and 'Please implement this policy,' indicating a clear support for the adoption of the regulation.", "tone": "positive", "tags": ["transgender rights", "student safety", "nondiscrimination"], "truncated": false, "error": null}
+{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87918", "analyzed_at": "2026-05-05T18:48:47.920316+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "An Essential Policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I fully support this policy,' indicating clear approval of the regulation. They also describe it as 'essential for the health and wellbeing of our students and of our community,' reinforcing their supportive stance.", "tone": "positive", "tags": ["student wellbeing", "community support"], "truncated": false, "error": null}