Replace hyphens with underscores in nomenclature; remove dependency

2026-05-05 16:47:11 -04:00
parent fd9d656e13
commit 683bfb324f
5 changed files with 67 additions and 127 deletions
--- a/tests/analysis_gpt4o_realtime.py
+++ b/tests/analysis_gpt4o_realtime.py
@@ -1,15 +1,14 @@
-"""Unit tests for analysis/gpt4o/analysis.py — no real API calls."""
+"""Unit tests for analysis/gpt4o/analysis_realtime.py — no real API calls."""

 import json
 import sys
 from pathlib import Path
-from unittest.mock import MagicMock, patch
+from unittest.mock import MagicMock

 import pytest

-# Make the module importable without installing as a package
 sys.path.insert(0, str(Path(__file__).parent.parent / "analysis" / "gpt4o"))
-import analysis as gpt4o
+import analysis_realtime as rt


 # ---------------------------------------------------------------------------
@@ -51,26 +50,25 @@ def _mock_client(response_content: str = MOCK_RESPONSE_CONTENT):
 # Prompt versioning

 def test_prompt_version_is_7_hex_chars():
-    assert len(gpt4o.PROMPT_VERSION) == 7
-    assert all(c in "0123456789abcdef" for c in gpt4o.PROMPT_VERSION)
+    assert len(rt.PROMPT_VERSION) == 7
+    assert all(c in "0123456789abcdef" for c in rt.PROMPT_VERSION)


-def test_prompt_version_changes_with_system_prompt():
+def test_prompt_version_matches_prompt_file():
    import hashlib
-    alt = hashlib.sha256(("CHANGED" + gpt4o.USER_TEMPLATE).encode("utf-8")).hexdigest()[:7]
-    assert alt != gpt4o.PROMPT_VERSION
+    prompt_file = Path(__file__).parent.parent / "analysis" / "prompt-1.txt"
+    expected = hashlib.sha256(prompt_file.read_text(encoding="utf-8").strip().encode()).hexdigest()[:7]
+    assert rt.PROMPT_VERSION == expected


 def test_prompt_version_is_stable():
    import hashlib
-    v2 = hashlib.sha256(
-        (gpt4o.SYSTEM_PROMPT + gpt4o.USER_TEMPLATE).encode("utf-8")
-    ).hexdigest()[:7]
-    assert v2 == gpt4o.PROMPT_VERSION
+    v2 = hashlib.sha256(rt.SYSTEM_PROMPT.encode("utf-8")).hexdigest()[:7]
+    assert v2 == rt.PROMPT_VERSION


 # ---------------------------------------------------------------------------
-# Item detection via load_items
+# load_items

 def test_load_items_separates_forum_and_comments(tmp_path):
    jsonl = tmp_path / "test.jsonl"
@@ -78,7 +76,7 @@ def test_load_items_separates_forum_and_comments(tmp_path):
        json.dumps(FORUM_ITEM) + "\n" + json.dumps(COMMENT_ITEM) + "\n",
        encoding="utf-8",
    )
-    forum, comments = gpt4o.load_items(jsonl)
+    forum, comments = rt.load_items(jsonl)
    assert forum is not None
    assert forum["reg_title"] == FORUM_ITEM["reg_title"]
    assert len(comments) == 1
@@ -88,18 +86,15 @@ def test_load_items_separates_forum_and_comments(tmp_path):
 def test_load_items_no_forum(tmp_path):
    jsonl = tmp_path / "test.jsonl"
    jsonl.write_text(json.dumps(COMMENT_ITEM) + "\n", encoding="utf-8")
-    forum, comments = gpt4o.load_items(jsonl)
+    forum, comments = rt.load_items(jsonl)
    assert forum is None
    assert len(comments) == 1


 def test_load_items_skips_blank_lines(tmp_path):
    jsonl = tmp_path / "test.jsonl"
-    jsonl.write_text(
-        "\n" + json.dumps(COMMENT_ITEM) + "\n\n",
-        encoding="utf-8",
-    )
-    _, comments = gpt4o.load_items(jsonl)
+    jsonl.write_text("\n" + json.dumps(COMMENT_ITEM) + "\n\n", encoding="utf-8")
+    _, comments = rt.load_items(jsonl)
    assert len(comments) == 1


@@ -108,40 +103,37 @@ def test_load_items_skips_blank_lines(tmp_path):

 def test_truncation_applied():
    long_comment = {**COMMENT_ITEM, "text": "x" * 7000}
-    messages, truncated = gpt4o.build_messages(long_comment, FORUM_ITEM)
+    messages, truncated = rt.build_messages(long_comment, FORUM_ITEM)
    assert truncated is True
-    user_content = messages[1]["content"]
-    assert "... [truncated]" in user_content
-    # The x's in the prompt must not exceed MAX_COMMENT_CHARS
-    x_count = user_content.count("x")
-    assert x_count == gpt4o.MAX_COMMENT_CHARS
+    assert "... [truncated]" in messages[1]["content"]
+    assert messages[1]["content"].count("x") == rt.MAX_COMMENT_CHARS


 def test_no_truncation_for_short_comment():
-    _, truncated = gpt4o.build_messages(COMMENT_ITEM, FORUM_ITEM)
+    _, truncated = rt.build_messages(COMMENT_ITEM, FORUM_ITEM)
    assert truncated is False


 def test_empty_text_fallback():
    empty = {**COMMENT_ITEM, "text": ""}
-    messages, truncated = gpt4o.build_messages(empty, FORUM_ITEM)
+    messages, truncated = rt.build_messages(empty, FORUM_ITEM)
    assert "[No body text provided]" in messages[1]["content"]
    assert truncated is False


 def test_none_text_fallback():
    none_text = {**COMMENT_ITEM, "text": None}
-    messages, _ = gpt4o.build_messages(none_text, FORUM_ITEM)
+    messages, _ = rt.build_messages(none_text, FORUM_ITEM)
    assert "[No body text provided]" in messages[1]["content"]


 def test_missing_forum_uses_unknown_context():
-    messages, _ = gpt4o.build_messages(COMMENT_ITEM, None)
+    messages, _ = rt.build_messages(COMMENT_ITEM, None)
    assert "[unknown]" in messages[1]["content"]


 def test_reg_context_included_in_prompt():
-    messages, _ = gpt4o.build_messages(COMMENT_ITEM, FORUM_ITEM)
+    messages, _ = rt.build_messages(COMMENT_ITEM, FORUM_ITEM)
    assert FORUM_ITEM["reg_title"] in messages[1]["content"]
    assert "HB 145" in messages[1]["content"]

@@ -150,8 +142,7 @@ def test_reg_context_included_in_prompt():
 # Output record schema

 def test_output_record_all_keys_present():
-    client = _mock_client()
-    record = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
+    record = rt.analyze_comment(_mock_client(), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
    required = {
        "run_id", "forum_id", "comment_id", "analyzed_at", "model", "prompt_version",
        "stance", "stance_confidence", "stance_rationale", "tone", "tags",
@@ -161,8 +152,7 @@ def test_output_record_all_keys_present():


 def test_output_record_correct_types():
-    client = _mock_client()
-    record = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
+    record = rt.analyze_comment(_mock_client(), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
    assert record["stance"] == "support"
    assert isinstance(record["stance_confidence"], float)
    assert isinstance(record["tags"], list)
@@ -171,13 +161,12 @@ def test_output_record_correct_types():


 def test_output_record_metadata():
-    client = _mock_client()
-    record = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
+    record = rt.analyze_comment(_mock_client(), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
    assert record["run_id"] == "run-123"
    assert record["forum_id"] == "452"
    assert record["comment_id"] == "87914"
    assert record["model"] == "gpt-4o"
-    assert record["prompt_version"] == gpt4o.PROMPT_VERSION
+    assert record["prompt_version"] == rt.PROMPT_VERSION
    assert record["input_title"] == COMMENT_ITEM["title"]


@@ -185,12 +174,12 @@ def test_output_record_metadata():
 # Error handling

 def test_error_record_on_api_failure():
-    client = MagicMock()
    import openai as _openai
+    client = MagicMock()
    client.chat.completions.create.side_effect = _openai.RateLimitError(
        "rate limit", response=MagicMock(status_code=429), body={}
    )
-    record = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
+    record = rt.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
    assert record["error"] is not None
    assert record["stance"] is None
    assert record["tone"] is None
@@ -198,8 +187,7 @@ def test_error_record_on_api_failure():


 def test_error_record_on_bad_json():
-    client = _mock_client("not valid json{{{")
-    record = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
+    record = rt.analyze_comment(_mock_client("not valid json{{{"), COMMENT_ITEM, FORUM_ITEM, "run-123", "gpt-4o")
    assert record["error"] is not None
    assert record["stance"] is None

@@ -210,21 +198,18 @@ def test_error_record_on_bad_json():
 def test_run_id_is_shared_across_records():
    client = _mock_client()
    run_id = "fixed-run-id"
-    r1 = gpt4o.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, run_id, "gpt-4o")
-    r2 = gpt4o.analyze_comment(client, {**COMMENT_ITEM, "comment_id": "99999"}, FORUM_ITEM, run_id, "gpt-4o")
+    r1 = rt.analyze_comment(client, COMMENT_ITEM, FORUM_ITEM, run_id, "gpt-4o")
+    r2 = rt.analyze_comment(client, {**COMMENT_ITEM, "comment_id": "99999"}, FORUM_ITEM, run_id, "gpt-4o")
    assert r1["run_id"] == r2["run_id"] == run_id


 # ---------------------------------------------------------------------------
-# Filename parsing
+# Filename helpers

 def test_scrape_ts_extracted_from_filename():
    p = Path("output/forum452_comments_2026-05-05T17-33-54+00-00.jsonl")
-    ts = gpt4o._scrape_ts_from_filename(p)
-    assert ts == "2026-05-05T17-33-54+00-00"
+    assert rt._scrape_ts_from_filename(p) == "2026-05-05T17-33-54+00-00"


 def test_scrape_ts_fallback_for_unknown_filename():
-    p = Path("output/somefile.jsonl")
-    ts = gpt4o._scrape_ts_from_filename(p)
-    assert ts == "unknown"
+    assert rt._scrape_ts_from_filename(Path("output/somefile.jsonl")) == "unknown"
--- a/tests/scrape_forum_spider.py
+++ b/tests/scrape_forum_spider.py