Compare commits

...

2 Commits

Author SHA1 Message Date
490c642bd9 added timestamp to tasks 2026-05-05 15:03:25 -04:00
d834d18c81 added 4o initial manual analysis and test 2026-05-05 15:00:34 -04:00
6 changed files with 545 additions and 5 deletions

View File

@@ -36,5 +36,5 @@ Description and PM notes
** evidence ** evidence
- commit: - commit:
- tests: - tests:
- datetime: - date: [2026-05-05 Tue 15:00]
``` ```

View File

283
analysis/gpt4o/analysis.py Normal file
View File

@@ -0,0 +1,283 @@
#!/usr/bin/env python3
"""
analysis/gpt4o/analysis.py — Manual GPT-4o sentiment pipeline for VA Townhall comments.
Usage:
python analysis/gpt4o/analysis.py <input_jsonl> [--limit {5,10,20,50}] [--model MODEL]
Output:
analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl
"""
import argparse
import hashlib
import json
import os
import re
import sys
import time
import uuid
from datetime import datetime, timezone
from pathlib import Path
from dotenv import load_dotenv
try:
import openai
except ImportError:
sys.exit("openai package not installed. Run: pip install openai")
# ---------------------------------------------------------------------------
# Prompt (version is derived from the content — changing either string changes PROMPT_VERSION)
# System prompt: defines the label vocabulary and the exact JSON keys the model
# must return. Editing this text changes PROMPT_VERSION below.
SYSTEM_PROMPT = """\
You are an expert policy analyst classifying public comments submitted to the Virginia Town Hall
regulatory comment system. You will be given the text of a proposed regulation and a single
public comment. Return ONLY a JSON object — no other text.
Definitions:
- stance: the commenter's position on whether the regulation should be adopted.
"support" = wants it approved (as-is or with changes);
"oppose" = wants it rejected or substantially weakened;
"neutral" = takes no position, asks a question, or provides factual input only;
"unknown" = too vague, off-topic, or uninterpretable to classify.
- tone: the emotional register of the writing, independent of stance.
"positive" = affirming, hopeful, appreciative;
"negative" = angry, fearful, alarmed, or contemptuous;
"neutral" = matter-of-fact, procedural, or informational;
"mixed" = contains both positive and negative emotional content;
"unclear" = tone cannot be determined (e.g., a one-word comment).
- stance_confidence: float 0.0–1.0, your confidence in the stance label.
- stance_rationale: 1–3 sentences explaining the key evidence; quote specific phrases where possible.
- tags: up to 5 short topic labels relevant to the comment's specific concerns (e.g.
"parental rights", "student safety", "privacy", "religious freedom", "LGBTQ+ inclusion",
"bullying prevention", "school sports", "bathroom access"). Empty array if none apply.
Return exactly these keys: stance, stance_confidence, stance_rationale, tone, tags.\
"""

# Per-comment user message; filled with str.format() using the placeholders
# reg_title, reg_desc, comment_id, comment_title and comment_text.
USER_TEMPLATE = """\
## Proposed Regulation
Title: {reg_title}
Description: {reg_desc}
---
## Public Comment
Comment ID: {comment_id}
Title: {comment_title}
Body:
{comment_text}
---
Classify this comment per the instructions. Return only JSON.\
"""

# Short content hash of both prompt strings: first 7 hex digits of the SHA-256
# of their concatenation, so any prompt edit yields a new version string.
PROMPT_VERSION = hashlib.sha256(
    (SYSTEM_PROMPT + USER_TEMPLATE).encode("utf-8")
).hexdigest()[:7]

# Maximum number of comment-body characters placed in the prompt.
MAX_COMMENT_CHARS = 6000

_RETRY_DELAYS = [1.0, 2.0]  # delays before attempt 2 and 3
# ---------------------------------------------------------------------------
# Core functions (importable for tests)
def load_items(path: Path) -> tuple[dict | None, list[dict]]:
"""Read a scraped JSONL file. Returns (forum_item_or_None, [comment_items])."""
forum = None
comments = []
with open(path, encoding="utf-8") as f:
for line in f:
line = line.strip()
if not line:
continue
item = json.loads(line)
if "comment_id" in item:
comments.append(item)
elif "reg_title" in item:
forum = item
return forum, comments
def build_messages(comment: dict, forum: dict | None) -> tuple[list, bool]:
    """Build the OpenAI messages list for one comment.

    Args:
        comment: Scraped comment item; ``text``, ``title`` and ``comment_id``
            are read from it.
        forum: Scraped forum item supplying ``reg_title`` / ``reg_desc``, or
            ``None`` when no forum item is available.

    Returns:
        ``(messages, truncated)`` where ``messages`` is a system message followed
        by a user message, and ``truncated`` is True if the comment body was cut
        to ``MAX_COMMENT_CHARS``.
    """
    forum_fields = forum or {}
    # `or` (rather than a .get default) so that an explicit null or empty value
    # also falls back, instead of being rendered as the literal text "None".
    reg_title = forum_fields.get("reg_title") or "[unknown]"
    reg_desc = forum_fields.get("reg_desc") or "[unknown]"

    body = (comment.get("text") or "").strip()
    truncated = False
    if not body:
        body = "[No body text provided]"
    elif len(body) > MAX_COMMENT_CHARS:
        body = body[:MAX_COMMENT_CHARS] + "... [truncated]"
        truncated = True

    user_text = USER_TEMPLATE.format(
        reg_title=reg_title,
        reg_desc=reg_desc,
        comment_id=comment.get("comment_id", ""),
        comment_title=comment.get("title") or "",
        comment_text=body,
    )
    return [
        {"role": "system", "content": SYSTEM_PROMPT},
        {"role": "user", "content": user_text},
    ], truncated
def _call_api(client, messages: list, model: str) -> str:
    """Request a JSON-object chat completion, retrying when rate-limited.

    The first attempt is immediate; each later attempt is preceded by the
    matching pause from ``_RETRY_DELAYS``. Only ``openai.RateLimitError``
    triggers another attempt; any other exception propagates immediately.
    If every attempt is rate-limited, the last ``RateLimitError`` is re-raised.

    Returns the ``content`` of the first choice's message.
    """
    rate_limit_error = None
    for pause in [0.0, *_RETRY_DELAYS]:
        if pause:
            time.sleep(pause)
        try:
            completion = client.chat.completions.create(
                model=model,
                messages=messages,
                response_format={"type": "json_object"},
                temperature=0.0,
            )
            reply = completion.choices[0].message.content
        except openai.RateLimitError as exc:
            rate_limit_error = exc
        else:
            return reply
    raise rate_limit_error  # type: ignore[misc]
def parse_api_response(content: str) -> dict:
    """Parse the model's JSON reply and keep only the expected keys.

    Args:
        content: Raw message content returned by the model.

    Returns:
        A dict with exactly the keys ``stance``, ``stance_confidence``,
        ``stance_rationale``, ``tone`` and ``tags``. Keys missing from the reply
        map to ``None``; extra keys are dropped.

    Raises:
        ValueError: If ``content`` is empty or ``None``, is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError``), or decodes to
            something other than a JSON object.
    """
    if not content:
        raise ValueError("model returned no content")
    data = json.loads(content)
    if not isinstance(data, dict):
        raise ValueError(f"expected a JSON object, got {type(data).__name__}")
    keys = ("stance", "stance_confidence", "stance_rationale", "tone", "tags")
    return {k: data.get(k) for k in keys}
def analyze_comment(
    client,
    comment: dict,
    forum: dict | None,
    run_id: str,
    model: str,
) -> dict:
    """Analyze one comment and return a fully-formed output record.

    Args:
        client: OpenAI client used for the chat-completion call.
        comment: Scraped comment item to classify.
        forum: Scraped forum item giving regulation context, or ``None``.
        run_id: Identifier copied into the record's ``run_id`` field.
        model: Model name passed to the API and stored in the record.

    Returns:
        A dict with the metadata fields (``run_id``, ``forum_id``,
        ``comment_id``, ``analyzed_at``, ``model``, ``prompt_version``,
        ``input_title``), the five classification fields, ``truncated`` and
        ``error``. On success ``error`` is ``None``. On any failure the
        classification fields are ``None`` and ``error`` is a non-empty
        ``"ExceptionType: message"`` string. This function does not raise for
        per-comment failures.
    """
    base = {
        "run_id": run_id,
        "forum_id": comment.get("forum_id", ""),
        "comment_id": comment.get("comment_id", ""),
        "analyzed_at": datetime.now(timezone.utc).isoformat(),
        "model": model,
        "prompt_version": PROMPT_VERSION,
        "input_title": comment.get("title", ""),
    }
    truncated = False
    try:
        messages, truncated = build_messages(comment, forum)
        content = _call_api(client, messages, model)
        parsed = parse_api_response(content)
    except Exception as exc:
        # Deliberately broad: one bad comment must yield an error record rather
        # than abort the whole run.
        return {
            **base,
            "stance": None, "stance_confidence": None,
            "stance_rationale": None, "tone": None, "tags": None,
            # Keep the real flag if the failure happened after truncation.
            "truncated": truncated,
            # Prefix the type so the message is never empty (str(exc) can be "").
            "error": f"{type(exc).__name__}: {exc}",
        }
    return {**base, **parsed, "truncated": truncated, "error": None}
def _scrape_ts_from_filename(path: Path) -> str:
    """Return the scrape timestamp embedded in a scraped JSONL filename.

    Looks in the filename stem for ``YYYY-MM-DDT`` followed by digits, ``-``,
    ``+`` or ``:``. Colons in the match are replaced with ``-``. Returns
    ``"unknown"`` when the stem contains no such timestamp.
    """
    found = re.search(r"(\d{4}-\d{2}-\d{2}T[\d\-+:]+)", path.stem)
    if not found:
        return "unknown"
    timestamp = found.group(1)
    return timestamp.replace(":", "-")
# ---------------------------------------------------------------------------
# CLI
def main() -> None:
    """Command-line entry point: classify the comments of one scraped JSONL file.

    Loads ``.env``, parses the CLI arguments, reads the input file, runs each
    (optionally limited) comment through ``analyze_comment`` and writes one JSON
    record per line to a new file in this script's directory. Progress and the
    final summary go to stderr. Exits with a message when the API key is
    missing or the input file cannot be read or parsed.
    """
    load_dotenv()
    parser = argparse.ArgumentParser(
        description="Analyze VA Townhall public comments with GPT-4o.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    parser.add_argument("input", help="Path to scraped JSONL file")
    parser.add_argument(
        "--limit",
        type=int,
        choices=[5, 10, 20, 50],
        metavar="{5,10,20,50}",
        help="Process only the first N comments (for testing). Omit to process all.",
    )
    parser.add_argument(
        "--model",
        default="gpt-4o",
        help="OpenAI model name (default: gpt-4o)",
    )
    args = parser.parse_args()

    api_key = os.environ.get("OPENAI_API_KEY")
    if not api_key:
        sys.exit("OPENAI_API_KEY not set. Create a .env file or export the variable.")

    input_path = Path(args.input)
    # is_file() rather than exists(): a directory path must be rejected here too.
    if not input_path.is_file():
        sys.exit(f"File not found: {input_path}")

    print(f"Reading {input_path} ...", file=sys.stderr)
    try:
        forum, comments = load_items(input_path)
    except (OSError, ValueError) as exc:
        # ValueError covers both malformed JSON and undecodable bytes.
        sys.exit(f"Could not read {input_path}: {exc}")
    if forum is None:
        print(
            "Warning: no ForumItem found in file — regulation context will be [unknown].",
            file=sys.stderr,
        )
    if args.limit:
        comments = comments[: args.limit]

    forum_id = (forum or {}).get("forum_id", "unknown")
    scrape_ts = _scrape_ts_from_filename(input_path)
    run_ts = datetime.now(timezone.utc).strftime("%Y-%m-%dT%H-%M-%S+00-00")
    # Model names may contain "/" or ":" (e.g. fine-tuned ids); neither is safe
    # in a filename on every platform.
    model_slug = args.model.replace("/", "-").replace(":", "-")
    out_dir = Path(__file__).parent
    out_path = out_dir / f"forum{forum_id}_{scrape_ts}_{model_slug}_{run_ts}.jsonl"

    run_id = str(uuid.uuid4())
    client = openai.OpenAI(api_key=api_key)
    n_ok = n_err = 0
    total = len(comments)
    print(f"Analyzing {total} comments → {out_path}", file=sys.stderr)
    with open(out_path, "w", encoding="utf-8") as out:
        for i, comment in enumerate(comments, 1):
            record = analyze_comment(client, comment, forum, run_id, args.model)
            out.write(json.dumps(record, ensure_ascii=False) + "\n")
            # Flush per record so an interrupted run leaves usable partial output.
            out.flush()
            # Compare with None: an empty error string is still an error.
            if record["error"] is not None:
                n_err += 1
                print(
                    f" [{i}/{total}] ERROR {comment.get('comment_id')}: {record['error']}",
                    file=sys.stderr,
                )
            else:
                n_ok += 1
                print(
                    f" [{i}/{total}] OK {comment.get('comment_id')} → {record['stance']}",
                    file=sys.stderr,
                )
            time.sleep(0.1)
    print(f"\nDone. {n_ok} ok, {n_err} errors → {out_path}", file=sys.stderr)


if __name__ == "__main__":
    main()

View File

@@ -0,0 +1,5 @@
{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87914", "analyzed_at": "2026-05-05T18:48:32.792363+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Support the Model Policy Wholeheartedly", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I support the model policy wholeheartedly' and praises the policy for creating inclusive and welcoming schools for transgender and non-binary students. They also express gratitude towards the Virginia Department of Education for developing the policy.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "privacy", "bullying prevention"], "truncated": false, "error": null}
{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87915", "analyzed_at": "2026-05-05T18:48:37.398155+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this vital policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, 'I strongly support these proposals,' indicating clear approval of the regulation. They also affirm the importance of treating every student with dignity and respect, aligning with the policy's goals.", "tone": "positive", "tags": ["LGBTQ+ inclusion", "student safety", "nondiscrimination"], "truncated": false, "error": null}
{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87916", "analyzed_at": "2026-05-05T18:48:41.236389+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states, \"I am in full support of this policy guidance,\" indicating clear support for the regulation. The phrase \"Trans rights are human rights\" further reinforces their supportive stance.", "tone": "positive", "tags": ["transgender rights", "nondiscrimination"], "truncated": false, "error": null}
{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87917", "analyzed_at": "2026-05-05T18:48:44.321705+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "Please support this policy", "stance": "support", "stance_confidence": 0.95, "stance_rationale": "The commenter explicitly states 'Please support this policy' and 'Please implement this policy,' indicating a clear support for the adoption of the regulation.", "tone": "positive", "tags": ["transgender rights", "student safety", "nondiscrimination"], "truncated": false, "error": null}
{"run_id": "a513d1c5-88a4-40b6-8d60-be255908a157", "forum_id": "452", "comment_id": "87918", "analyzed_at": "2026-05-05T18:48:47.920316+00:00", "model": "gpt-4o", "prompt_version": "17ff34f", "input_title": "An Essential Policy", "stance": "support", "stance_confidence": 1.0, "stance_rationale": "The commenter explicitly states 'I fully support this policy,' indicating clear approval of the regulation. They also describe it as 'essential for the health and wellbeing of our students and of our community,' reinforcing their supportive stance.", "tone": "positive", "tags": ["student wellbeing", "community support"], "truncated": false, "error": null}

View File

@@ -31,7 +31,7 @@ Comments are hydrated in backend via js-cued button (AJAX?).
- tests: 8 passing (`python -m pytest tests -q`) or (`python -m pytest tests/`) - tests: 8 passing (`python -m pytest tests -q`) or (`python -m pytest tests/`)
- `scrapy crawl forum -a forum_id=452 -s LOG_LEVEL=WARNING 2>&1` - `scrapy crawl forum -a forum_id=452 -s LOG_LEVEL=WARNING 2>&1`
- retrieved 9083 comments - retrieved 9083 comments
- datetime: 2026-05-05 - datetime: [2026-05-05 Tue 14:00]
* [ ] t1.2: initial 4o sentiment * [ ] t1.2: initial 4o sentiment
Write a simple manual pipeline for gpt-4o that reads one scraped forum jsonl file and produces a separate analyzed jsonl file. this step must not mutate scraper output. analysis should classify each comment for regulatory stance, generic tone/sentiment, confidence, and enough rationale/evidence to support later dashboard drilldown. Write a simple manual pipeline for gpt-4o that reads one scraped forum jsonl file and produces a separate analyzed jsonl file. this step must not mutate scraper output. analysis should classify each comment for regulatory stance, generic tone/sentiment, confidence, and enough rationale/evidence to support later dashboard drilldown.
@@ -53,13 +53,35 @@ Should be run manually, separate from scraper. You may use scrapy, but are not r
5. capture issue/topic tags for later grouping, may be empty 5. capture issue/topic tags for later grouping, may be empty
6. use .env for api key management 6. use .env for api key management
7. document the exact prompt version used; prompt text may live in code or docs, but must have a version string/hash in output records 7. document the exact prompt version used; prompt text may live in code or docs, but must have a version string/hash in output records
8. for this run, an option to run the first N comments (5, 10, 20, 50) - will add batch processing later
** notes
- analysis/gpt4o/analysis.py: standalone script; core functions importable for tests.
- Prompt version = SHA-256[:7] of SYSTEM_PROMPT+USER_TEMPLATE; auto-updates on prompt change.
- Output: analysis/gpt4o/forum{id}_{scrape_ts}_{model}_{run_ts}.jsonl, one record per comment.
- --limit {5,10,20,50} for test runs; omit for full corpus. Batch processing planned for later.
- Incremental flush after each record: safe to interrupt and inspect partial output.
- temperature=0.0 for deterministic, reproducible classifications across runs.
- Retry: 3 attempts (delays 1s, 2s) on RateLimitError; all other exceptions → error record + continue.
- openai==2.34.0 installed; python-dotenv already present; key loaded from .env via OPENAI_API_KEY.
- MAX_COMMENT_CHARS=6000: covers >99% without truncation; outliers (e.g. 18k-char law firm brief) flagged with truncated=True.
** evidence
- commit: d834d18
- tests: 20 passing (pytest tests/test_gpt4o_analysis.py), 28 total across suite
python ./analysis/gpt4o/analysis.py --limit 5 ./output/f452.jsonl
- date: [2026-05-05 Tue 15:00]
* [ ] t1.2.1: 4o with batch processing
** acceptance criteria
1. input scraped jsonl doc by filename/path, and process the whole thing via batch processing
** notes ** notes
** evidence ** evidence
- commit: - commit:
- tests: - tests:
- date: - date:
* [ ] X: complete proposal information * [ ] X: complete proposal information
Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted. Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted.

View File

@@ -0,0 +1,230 @@
"""Unit tests for analysis/gpt4o/analysis.py — no real API calls."""
import json
import sys
from pathlib import Path
from unittest.mock import MagicMock, patch
import pytest
# Make the module importable without installing as a package
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
import analysis as gpt4o
# ---------------------------------------------------------------------------
# Fixtures
# Sample forum record used as the regulation context in these tests.
FORUM_ITEM = dict(
    forum_id="452",
    reg_title="Model Policies for Transgender Students",
    reg_desc="Guidance developed in response to HB 145.",
)

# Sample comment record with a short body (well under the truncation limit).
COMMENT_ITEM = dict(
    forum_id="452",
    comment_id="87914",
    author="Alice Example",
    date="2021-01-04T09:15:00",
    title="I support this policy",
    text="This is a great policy that protects students.",
)

# JSON text the mocked API returns by default: one well-formed classification.
MOCK_RESPONSE_CONTENT = json.dumps(
    dict(
        stance="support",
        stance_confidence=0.95,
        stance_rationale="Commenter explicitly endorses the policy.",
        tone="positive",
        tags=["student safety", "LGBTQ+ inclusion"],
    )
)
def _mock_client(response_content: str = MOCK_RESPONSE_CONTENT):
    """Return a MagicMock client whose chat completion yields ``response_content``.

    ``client.chat.completions.create(...)`` returns an object whose
    ``choices[0].message.content`` is the given string.
    """
    fake_choice = MagicMock()
    fake_choice.message.content = response_content
    fake_completion = MagicMock(choices=[fake_choice])

    fake_client = MagicMock()
    fake_client.chat.completions.create.return_value = fake_completion
    return fake_client
# ---------------------------------------------------------------------------
# Prompt versioning
def test_prompt_version_is_7_hex_chars():
    """PROMPT_VERSION is exactly seven lowercase hexadecimal characters."""
    version = gpt4o.PROMPT_VERSION
    assert len(version) == 7
    assert set(version) <= set("0123456789abcdef")
def test_prompt_version_changes_with_system_prompt():
    """Hashing a different system prompt gives a different version string."""
    import hashlib
    altered_text = "CHANGED" + gpt4o.USER_TEMPLATE
    altered_version = hashlib.sha256(altered_text.encode("utf-8")).hexdigest()[:7]
    assert gpt4o.PROMPT_VERSION != altered_version
def test_prompt_version_is_stable():
    """Recomputing the hash from the module's prompt strings reproduces PROMPT_VERSION."""
    import hashlib
    prompt_text = gpt4o.SYSTEM_PROMPT + gpt4o.USER_TEMPLATE
    digest = hashlib.sha256(prompt_text.encode("utf-8")).hexdigest()
    assert gpt4o.PROMPT_VERSION == digest[:7]
# ---------------------------------------------------------------------------
# Item detection via load_items
def test_load_items_separates_forum_and_comments(tmp_path):
    """A file with one forum line and one comment line is split into both parts."""
    source = tmp_path / "test.jsonl"
    records = [json.dumps(FORUM_ITEM), json.dumps(COMMENT_ITEM)]
    source.write_text("".join(rec + "\n" for rec in records), encoding="utf-8")

    forum, comments = gpt4o.load_items(source)

    assert forum is not None
    assert forum["reg_title"] == FORUM_ITEM["reg_title"]
    assert [c["comment_id"] for c in comments] == ["87914"]
def test_load_items_no_forum(tmp_path):
    """With only a comment line in the file, the forum result is None."""
    source = tmp_path / "test.jsonl"
    source.write_text(f"{json.dumps(COMMENT_ITEM)}\n", encoding="utf-8")

    forum, comments = gpt4o.load_items(source)

    assert forum is None
    assert len(comments) == 1
def test_load_items_skips_blank_lines(tmp_path):
    """Blank lines before and after a record are ignored."""
    source = tmp_path / "test.jsonl"
    # Joins to "\n<record>\n\n": one leading and two trailing newlines.
    content = "\n".join(["", json.dumps(COMMENT_ITEM), "", ""])
    source.write_text(content, encoding="utf-8")

    loaded = gpt4o.load_items(source)

    assert len(loaded[1]) == 1
# ---------------------------------------------------------------------------
# build_messages
def test_truncation_applied():
    """A 7000-character body is cut to MAX_COMMENT_CHARS and flagged as truncated."""
    oversized = dict(COMMENT_ITEM, text="x" * 7000)
    messages, truncated = gpt4o.build_messages(oversized, FORUM_ITEM)
    user_content = messages[1]["content"]

    assert truncated is True
    assert "... [truncated]" in user_content
    # Only the comment body contributes "x" characters to the prompt, so their
    # count equals the number of body characters that survived truncation.
    assert user_content.count("x") == gpt4o.MAX_COMMENT_CHARS
def test_no_truncation_for_short_comment():
    """A short body is passed through with truncated == False."""
    _messages, was_truncated = gpt4o.build_messages(COMMENT_ITEM, FORUM_ITEM)
    assert was_truncated is False
def test_empty_text_fallback():
    """An empty body is replaced by the placeholder text and is not marked truncated."""
    blank_comment = dict(COMMENT_ITEM, text="")
    messages, truncated = gpt4o.build_messages(blank_comment, FORUM_ITEM)
    user_content = messages[1]["content"]
    assert "[No body text provided]" in user_content
    assert truncated is False
def test_none_text_fallback():
    """A body of None is treated like an empty body."""
    null_comment = dict(COMMENT_ITEM, text=None)
    messages, _truncated = gpt4o.build_messages(null_comment, FORUM_ITEM)
    user_content = messages[1]["content"]
    assert "[No body text provided]" in user_content
def test_missing_forum_uses_unknown_context():
    """Without a forum item the prompt contains the "[unknown]" placeholder."""
    messages, _truncated = gpt4o.build_messages(COMMENT_ITEM, None)
    user_content = messages[1]["content"]
    assert "[unknown]" in user_content
def test_reg_context_included_in_prompt():
    """The regulation title and description both appear in the user message."""
    messages, _truncated = gpt4o.build_messages(COMMENT_ITEM, FORUM_ITEM)
    user_content = messages[1]["content"]
    for expected in (FORUM_ITEM["reg_title"], "HB 145"):
        assert expected in user_content
# ---------------------------------------------------------------------------
# Output record schema
def test_output_record_all_keys_present():
    """A successful record has exactly the documented set of keys."""
    record = gpt4o.analyze_comment(
        _mock_client(), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o"
    )
    expected_keys = {
        "run_id", "forum_id", "comment_id", "analyzed_at", "model", "prompt_version",
        "stance", "stance_confidence", "stance_rationale", "tone", "tags",
        "input_title", "truncated", "error",
    }
    assert set(record) == expected_keys
def test_output_record_correct_types():
    """Classification fields of a successful record have the expected types and values."""
    record = gpt4o.analyze_comment(
        _mock_client(), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o"
    )
    assert record["stance"] == "support"
    for field, expected_type in (("stance_confidence", float), ("tags", list)):
        assert isinstance(record[field], expected_type)
    assert record["truncated"] is False
    assert record["error"] is None
def test_output_record_metadata():
    """Run, forum, comment, model and prompt metadata are copied into the record."""
    record = gpt4o.analyze_comment(
        _mock_client(), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o"
    )
    expected = {
        "run_id": "run-123",
        "forum_id": "452",
        "comment_id": "87914",
        "model": "gpt-4o",
        "prompt_version": gpt4o.PROMPT_VERSION,
        "input_title": COMMENT_ITEM["title"],
    }
    for field, value in expected.items():
        assert record[field] == value
# ---------------------------------------------------------------------------
# Error handling
def test_error_record_on_api_failure():
    """A persistent rate-limit error is retried, then reported as an error record.

    The retry sleeps are patched out so the test does not really wait through
    the configured delays, and the call count confirms every attempt was made.
    """
    import openai as _openai
    client = MagicMock()
    # An exception instance as side_effect is raised on every call.
    client.chat.completions.create.side_effect = _openai.RateLimitError(
        "rate limit", response=MagicMock(status_code=429), body={}
    )
    with patch.object(gpt4o.time, "sleep") as fake_sleep:
        record = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")

    # One immediate attempt plus one per configured retry delay.
    assert client.chat.completions.create.call_count == 1 + len(gpt4o._RETRY_DELAYS)
    assert fake_sleep.call_count == len(gpt4o._RETRY_DELAYS)
    assert record["error"] is not None
    assert record["stance"] is None
    assert record["tone"] is None
    assert record["tags"] is None
def test_error_record_on_bad_json():
    """Unparseable model output yields an error record with no stance."""
    broken_client = _mock_client("not valid json{{{")
    record = gpt4o.analyze_comment(
        broken_client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o"
    )
    assert record["error"] is not None
    assert record["stance"] is None
# ---------------------------------------------------------------------------
# run_id consistency
def test_run_id_is_shared_across_records():
    """Two comments analyzed with the same run_id both carry that run_id."""
    run_id = "fixed-run-id"
    client = _mock_client()
    other_comment = dict(COMMENT_ITEM, comment_id="99999")

    first = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, run_id, "gpt-4o")
    second = gpt4o.analyze_comment(client, other_comment, FORUM_ITEM, run_id, "gpt-4o")

    assert first["run_id"] == second["run_id"] == run_id
# ---------------------------------------------------------------------------
# Filename parsing
def test_scrape_ts_extracted_from_filename():
    """The timestamp embedded in a scraped filename is returned unchanged."""
    scraped = Path("output") / "forum452_comments_2026-05-05T17-33-54+00-00.jsonl"
    assert gpt4o._scrape_ts_from_filename(scraped) == "2026-05-05T17-33-54+00-00"
def test_scrape_ts_fallback_for_unknown_filename():
    """A filename without a timestamp yields the "unknown" placeholder."""
    unstamped = Path("output") / "somefile.jsonl"
    assert gpt4o._scrape_ts_from_filename(unstamped) == "unknown"