added create_csv.py

2026-05-07 17:22:00 -04:00
parent 72c2ae0ca0
commit 28d6d222bd
6 changed files with 9583 additions and 11 deletions
--- a/tests/create_csv.py
+++ b/tests/create_csv.py
@@ -0,0 +1,155 @@
+"""Unit tests for analysis/create_csv.py — no external API calls."""
+
+import json
+import sys
+from pathlib import Path
+
+import pandas as pd
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
+import create_csv as cc
+
+
+# ---------------------------------------------------------------------------
+# Helpers
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:
+    """Write *rows* to *path* as UTF-8 JSON Lines, one object per line."""
+    with open(path, "w", encoding="utf-8") as sink:
+        sink.writelines(json.dumps(record) + "\n" for record in rows)
+
+
+RAW_ROWS = [  # raw forum comments; comment "3" has no entry in ANALYSIS_ROWS
+    {"forum_id": "452", "comment_id": "1", "title": "Support", "text": "I support.", "date": "2021-01-01", "author": "Alice"},
+    {"forum_id": "452", "comment_id": "2", "title": "Oppose",  "text": "I oppose.",  "date": "2021-01-02", "author": "Bob"},
+    {"forum_id": "452", "comment_id": "3", "title": "Neutral", "text": "No opinion.","date": "2021-01-03", "author": "Carol"},
+]
+
+ANALYSIS_ROWS = [  # analysis records for comments "1" and "2" only; tags are JSON-encoded strings
+    {"comment_id": "1", "stance": "support", "stance_confidence": 0.9, "stance_rationale": "clear support",
+     "tone": "neutral", "tags": '["policy"]', "error": None, "truncated": False,
+     "analyzed_at": "2021-01-10", "prompt_version": "1", "model": "gpt-4o-mini"},
+    {"comment_id": "2", "stance": "oppose",  "stance_confidence": 0.8, "stance_rationale": "clear oppose",
+     "tone": "negative", "tags": '[]', "error": None, "truncated": False,
+     "analyzed_at": "2021-01-10", "prompt_version": "1", "model": "gpt-4o-mini"},
+]
+
+
+# ---------------------------------------------------------------------------
+# load_raw
+
+def test_load_raw_returns_raw_cols(tmp_path):
+    """load_raw yields exactly cc.RAW_COLS, in that order."""
+    source = tmp_path / "forum.jsonl"
+    _write_jsonl(source, RAW_ROWS)
+    assert list(cc.load_raw(source).columns) == cc.RAW_COLS
+
+
+def test_load_raw_row_count(tmp_path):
+    """Each comment written to the JSONL file becomes one loaded row."""
+    source = tmp_path / "forum.jsonl"
+    _write_jsonl(source, RAW_ROWS)
+    assert len(cc.load_raw(source)) == 3
+
+
+def test_load_raw_skips_non_comment_rows(tmp_path):
+    """A record lacking comment_id (here, a forum metadata row) is not loaded."""
+    metadata_row = {"forum_id": "452", "reg_title": "Metadata row"}
+    source = tmp_path / "forum.jsonl"
+    _write_jsonl(source, [*RAW_ROWS, metadata_row])
+    loaded = cc.load_raw(source)
+    assert len(loaded) == 3
+
+
+# ---------------------------------------------------------------------------
+# load_analysis
+
+def test_load_analysis_returns_analysis_cols(tmp_path):
+    """load_analysis returns comment_id followed by cc.ANALYSIS_COLS, in order."""
+    jobs_dir = tmp_path / "jobs"
+    jobs_dir.mkdir()
+    _write_jsonl(jobs_dir / "job1-output.jsonl", ANALYSIS_ROWS)
+    columns = list(cc.load_analysis(jobs_dir).columns)
+    assert columns == ["comment_id"] + cc.ANALYSIS_COLS
+
+
+def test_load_analysis_skips_raw_files(tmp_path):
+    """Only job1-output.jsonl is counted; job1-output-raw.jsonl is ignored."""
+    jobs_dir = tmp_path / "jobs"
+    jobs_dir.mkdir()
+    for filename in ("job1-output.jsonl", "job1-output-raw.jsonl"):  # second must be skipped
+        _write_jsonl(jobs_dir / filename, ANALYSIS_ROWS)
+    assert len(cc.load_analysis(jobs_dir)) == len(ANALYSIS_ROWS)
+
+
+def test_load_analysis_concatenates_multiple_files(tmp_path):
+    """Records split across two output files are combined into one frame."""
+    jobs_dir = tmp_path / "jobs"
+    jobs_dir.mkdir()
+    for job_no, record in enumerate(ANALYSIS_ROWS, start=1):  # one record per job file
+        _write_jsonl(jobs_dir / f"job{job_no}-output.jsonl", [record])
+    assert len(cc.load_analysis(jobs_dir)) == 2
+
+
+# ---------------------------------------------------------------------------
+# join
+
+def test_join_all_raw_preserved():
+    """Left join: all raw comments appear in output, even without analysis."""
+    raw = pd.DataFrame(RAW_ROWS)[cc.RAW_COLS]
+    analysis = pd.DataFrame(ANALYSIS_ROWS)
+    for col in cc.ANALYSIS_COLS:
+        if col not in analysis.columns:
+            analysis[col] = None  # pad columns absent from the fixture rows
+    analysis = analysis[["comment_id"] + cc.ANALYSIS_COLS]
+
+    merged = cc.join(raw, analysis)
+    assert len(merged) == 3  # all 3 raw rows, even comment_id=3 with no analysis
+
+
+def test_join_unanalyzed_row_has_null_stance():
+    """Comment "3" has no analysis record, so its joined stance is null."""
+    raw = pd.DataFrame(RAW_ROWS)[cc.RAW_COLS]
+    analysis = pd.DataFrame(ANALYSIS_ROWS)
+    for col in cc.ANALYSIS_COLS:
+        if col not in analysis.columns:
+            analysis[col] = None  # pad columns absent from the fixture rows
+    analysis = analysis[["comment_id"] + cc.ANALYSIS_COLS]
+    merged = cc.join(raw, analysis)
+    unanalyzed = merged[merged["comment_id"] == "3"]
+    assert pd.isna(unanalyzed.iloc[0]["stance"])
+
+
+def test_join_column_order():
+    """The joined frame's columns equal cc.OUTPUT_COLS, order included."""
+    raw = pd.DataFrame(RAW_ROWS)[cc.RAW_COLS]
+    analysis = pd.DataFrame(ANALYSIS_ROWS)
+    for col in cc.ANALYSIS_COLS:
+        if col not in analysis.columns:
+            analysis[col] = None  # pad columns absent from the fixture rows
+    analysis = analysis[["comment_id"] + cc.ANALYSIS_COLS]
+    merged = cc.join(raw, analysis)
+    assert list(merged.columns) == cc.OUTPUT_COLS
+
+
+# ---------------------------------------------------------------------------
+# End-to-end: write + read CSV
+
+def test_csv_written_correctly(tmp_path):
+    """End-to-end: load, join, write the CSV, then read it back."""
+    raw_path = tmp_path / "forum.jsonl"
+    _write_jsonl(raw_path, RAW_ROWS)
+
+    jobs = tmp_path / "jobs"
+    jobs.mkdir()
+    _write_jsonl(jobs / "job1-output.jsonl", ANALYSIS_ROWS)
+
+    out = tmp_path / "review.csv"
+    merged = cc.join(cc.load_raw(raw_path), cc.load_analysis(jobs))
+    merged.to_csv(out, index=False, encoding="utf-8-sig")
+
+    # Decode with the codec used for writing so the BOM is stripped explicitly.
+    loaded = pd.read_csv(out, encoding="utf-8-sig")
+    assert len(loaded) == 3
+    assert list(loaded.columns) == cc.OUTPUT_COLS
--- a/tests/validate-sentiment.py
+++ b/tests/validate-sentiment.py
@@ -0,0 +1,217 @@
+"""Unit tests for analysis/validate.py — no file I/O beyond tmp_path."""
+
+import json
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
+
+try:
+    import pandas as pd
+except ImportError:
+    pytest.skip("pandas not installed", allow_module_level=True)
+
+import validate as vl
+
+# ---------------------------------------------------------------------------
+# Fixtures
+
+
+def _write_jsonl(path: Path, rows: list[dict]) -> None:
+    """Write *rows* to *path* as UTF-8 JSON Lines, leaving non-ASCII unescaped."""
+    payload = "".join(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
+    Path(path).write_text(payload, encoding="utf-8")
+
+
+RAW_ROWS = [  # three raw comments in forum 452; comment "3" has no entry in ANALYSIS_ROWS
+    {"forum_id": "452", "comment_id": "1", "title": "Support it",
+     "text": "I support this.", "date": "2021-01-04T09:00:00", "author": "Alice"},
+    {"forum_id": "452", "comment_id": "2", "title": "Oppose it",
+     "text": "I oppose this.", "date": "2021-01-05T10:00:00", "author": "Bob"},
+    {"forum_id": "452", "comment_id": "3", "title": "Neutral",
+     "text": "No opinion.", "date": "2021-01-06T11:00:00", "author": "Carol"},
+]
+
+ANALYSIS_ROWS = [  # analysis records for comments "1" and "2" only; tags are lists, not JSON strings
+    {"run_id": "r1", "forum_id": "452", "comment_id": "1", "input_title": "Support it",
+     "analyzed_at": "2026-05-06T12:00:00+00:00", "model": "gpt-5.4-mini",
+     "prompt_version": "abc1234", "stance": "support", "stance_confidence": 0.95,
+     "stance_rationale": "Commenter says 'I support'.", "tone": "positive",
+     "tags": ["student safety"], "truncated": False, "error": None},
+    {"run_id": "r1", "forum_id": "452", "comment_id": "2", "input_title": "Oppose it",
+     "analyzed_at": "2026-05-06T12:00:00+00:00", "model": "gpt-5.4-mini",
+     "prompt_version": "abc1234", "stance": "oppose", "stance_confidence": 0.90,
+     "stance_rationale": "Commenter says 'I oppose'.", "tone": "negative",
+     "tags": [], "truncated": False, "error": None},
+]
+
+FORUM_ROW = {"forum_id": "452", "reg_title": "Policy X", "reg_desc": "Guidance on Y."}  # forum-level record: carries no comment_id key
+
+
+@pytest.fixture()
+def raw_jsonl(tmp_path) -> Path:
+    """Raw JSONL file holding the forum metadata row followed by the comments."""
+    _write_jsonl((target := tmp_path / "f452.jsonl"), [FORUM_ROW, *RAW_ROWS])
+    return target
+
+
+@pytest.fixture()
+def jobs_dir(tmp_path) -> Path:
+    """Job directory jobs/f452-1 containing a single analysis output file."""
+    (job_dir := tmp_path / "jobs" / "f452-1").mkdir(parents=True)
+    _write_jsonl(job_dir / "job1-output.jsonl", ANALYSIS_ROWS)
+    return job_dir
+
+
+# ---------------------------------------------------------------------------
+# load_raw
+
+
+def test_load_raw_returns_only_comments(raw_jsonl):
+    """The three comments load with exactly the RAW_COLS columns (any order)."""
+    loaded = vl.load_raw(raw_jsonl)
+    assert (len(loaded), set(loaded.columns)) == (3, set(vl.RAW_COLS))
+
+
+def test_load_raw_correct_columns(raw_jsonl):
+    """Every name in RAW_COLS is present among the loaded columns."""
+    loaded_cols = vl.load_raw(raw_jsonl).columns
+    assert all(col in loaded_cols for col in vl.RAW_COLS)
+
+
+def test_load_raw_skips_forum_item(raw_jsonl):
+    """The forum metadata key reg_title does not become a column."""
+    assert "reg_title" not in vl.load_raw(raw_jsonl).columns
+
+
+# ---------------------------------------------------------------------------
+# load_analysis
+
+
+def test_load_analysis_skips_raw_files(tmp_path):
+    """Only job1-output.jsonl is counted; job1-output-raw.jsonl is ignored."""
+    job_dir = tmp_path / "jobs" / "f452-1"
+    job_dir.mkdir(parents=True)
+    for filename in ("job1-output-raw.jsonl", "job1-output.jsonl"):  # first must be skipped
+        _write_jsonl(job_dir / filename, ANALYSIS_ROWS)
+    assert len(vl.load_analysis(job_dir)) == len(ANALYSIS_ROWS)
+
+
+def test_load_analysis_concatenates_multiple_files(tmp_path):
+    """Records split across two output files are combined into one frame."""
+    job_dir = tmp_path / "jobs" / "f452-1"
+    job_dir.mkdir(parents=True)
+    for filename, record in zip(("job1-output.jsonl", "job2-output.jsonl"), ANALYSIS_ROWS):
+        _write_jsonl(job_dir / filename, [record])
+    assert len(vl.load_analysis(job_dir)) == 2
+
+
+def test_load_analysis_tags_serialized_as_json(jobs_dir):
+    """A list-valued tags field is loaded as a JSON string, not a list."""
+    analysis = vl.load_analysis(jobs_dir)
+    tags = analysis.loc[analysis["comment_id"] == "1", "tags"].iloc[0]
+    assert isinstance(tags, str) and json.loads(tags) == ["student safety"]
+
+
+def test_load_analysis_empty_tags_serialized(jobs_dir):
+    """An empty tags list is loaded as text that parses back to []."""
+    analysis = vl.load_analysis(jobs_dir)
+    assert json.loads(analysis[analysis["comment_id"] == "2"]["tags"].iloc[0]) == []
+
+
+# ---------------------------------------------------------------------------
+# join — by comment_id, not index
+
+
+def test_join_by_comment_id_not_index(raw_jsonl, jobs_dir):
+    """Analysis attaches to the matching comment_id even after raw is reordered."""
+    # Reverse rather than random-sample: a seeded shuffle of three rows may return
+    # the original order, leaving positional index and comment_id aligned.
+    raw = vl.load_raw(raw_jsonl).iloc[::-1].reset_index(drop=True)
+    merged = vl.join(raw, vl.load_analysis(jobs_dir))
+    row_1 = merged[merged["comment_id"] == "1"].iloc[0]
+    assert row_1["stance"] == "support"
+    assert row_1["author"] == "Alice"
+
+
+def test_join_unanalyzed_comment_has_null_stance(raw_jsonl, jobs_dir):
+    """Comment 3 exists only in the raw data, so its joined stance is NaN."""
+    merged = vl.join(vl.load_raw(raw_jsonl), vl.load_analysis(jobs_dir))
+    # Pick out the row for the comment that has no analysis record.
+    is_unanalyzed = merged["comment_id"] == "3"
+    stance = merged.loc[is_unanalyzed, "stance"].iloc[0]
+    assert pd.isna(stance)
+
+
+def test_join_preserves_all_raw_comments(raw_jsonl, jobs_dir):
+    """The joined frame has as many rows as the raw frame."""
+    raw = vl.load_raw(raw_jsonl)
+    merged = vl.join(raw, vl.load_analysis(jobs_dir))
+    assert len(merged) == len(raw)
+
+
+def test_join_output_columns_in_order(raw_jsonl, jobs_dir):
+    """Joined columns equal vl.OUTPUT_COLS, order included."""
+    raw = vl.load_raw(raw_jsonl)
+    merged = vl.join(raw, vl.load_analysis(jobs_dir))
+    assert merged.columns.tolist() == vl.OUTPUT_COLS
+
+
+# ---------------------------------------------------------------------------
+# Duplicate comment_id handling
+
+
+def test_duplicate_raw_id_flagged(raw_jsonl, jobs_dir):
+    """A duplicated raw comment_id survives the join as a duplicated output row."""
+    raw = vl.load_raw(raw_jsonl)
+    raw = pd.concat([raw, raw.iloc[[0]]], ignore_index=True)  # repeat the first row
+    assert raw["comment_id"].duplicated().sum() == 1  # precondition on the input
+    merged = vl.join(raw, vl.load_analysis(jobs_dir))
+    # join still produces a row for each raw row (left join)
+    assert len(merged) == len(raw)
+    assert merged["comment_id"].duplicated().sum() == 1
+
+
+def test_duplicate_analysis_id_produces_extra_rows(raw_jsonl, tmp_path):
+    """Two analysis records sharing one comment_id yield two joined rows."""
+    dup_dir = tmp_path / "jobs" / "f452-dup"
+    dup_dir.mkdir(parents=True)
+    original = ANALYSIS_ROWS[0]
+    conflicting = dict(original, stance="oppose")  # same comment, different stance
+    _write_jsonl(dup_dir / "job1-output.jsonl", [original, conflicting])
+    merged = vl.join(vl.load_raw(raw_jsonl), vl.load_analysis(dup_dir))
+    rows_for_1 = merged[merged["comment_id"] == "1"]
+    assert len(rows_for_1) == 2
+
+
+# ---------------------------------------------------------------------------
+# Validation counts (smoke test — just confirm it runs without error)
+
+
+def test_print_validation_runs(raw_jsonl, jobs_dir, capsys):
+    """Smoke test: print_validation writes the expected labels to stdout."""
+    raw = vl.load_raw(raw_jsonl)
+    analysis = vl.load_analysis(jobs_dir)
+    vl.print_validation(raw, analysis, vl.join(raw, analysis))
+    printed = capsys.readouterr().out
+    # Only the presence of each label is checked, not the numbers printed.
+    for label in ("Raw comments", "Stance counts", "Tone counts"):
+        assert label in printed
+
+
+# ---------------------------------------------------------------------------
+# CSV output
+
+
+def test_csv_written_to_jobs_dir(raw_jsonl, jobs_dir):
+    """The joined frame round-trips through review.csv inside the job directory."""
+    raw = vl.load_raw(raw_jsonl)
+    merged = vl.join(raw, vl.load_analysis(jobs_dir))
+    out_path = jobs_dir / "review.csv"
+    merged.to_csv(out_path, index=False, encoding="utf-8-sig")
+    assert out_path.exists()
+    loaded = pd.read_csv(out_path, encoding="utf-8-sig")
+    assert list(loaded.columns) == vl.OUTPUT_COLS
+    assert len(loaded) == len(raw)