added create_csv.py
This commit is contained in:
76
analysis/create_csv.py
Normal file
76
analysis/create_csv.py
Normal file
@@ -0,0 +1,76 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""analysis/create_csv.py — join raw scrape with analysis output for review."""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
|
||||||
|
# Fields taken from the raw scrape, in the order they appear in the review file.
RAW_COLS = [
    "forum_id",
    "comment_id",
    "title",
    "text",
    "date",
    "author",
]

# Fields taken from the analysis job output.
ANALYSIS_COLS = [
    "stance",
    "stance_confidence",
    "stance_rationale",
    "tone",
    "tags",
    "error",
    "truncated",
    "analyzed_at",
    "prompt_version",
    "model",
]

# Column layout of the joined table: raw fields first, analysis fields after.
OUTPUT_COLS = [*RAW_COLS, *ANALYSIS_COLS]
|
||||||
|
|
||||||
|
|
||||||
|
def load_raw(path: Path) -> pd.DataFrame:
    """Load the comment rows of a raw scrape JSONL file.

    Rows whose ``comment_id`` is null (the forum item, which is not a
    comment) are dropped. Any column of ``RAW_COLS`` absent from the file is
    added as an all-null column, and the result carries exactly the
    ``RAW_COLS`` columns in that order.

    Args:
        path: Raw scrape JSONL file, one JSON object per line.

    Returns:
        A new DataFrame with one row per comment.

    Raises:
        KeyError: If no row in the file has a ``comment_id`` field.
    """
    frame = pd.read_json(path, lines=True)
    # Copy the surviving rows so the assignments below write to an
    # independent frame rather than to a slice of the parsed one.
    comments = frame.loc[frame["comment_id"].notna()].copy()

    # The null comment_id of the forum row forces numeric IDs into float64;
    # left alone they would be written to the CSV as "123.0". Cast back to
    # integers when every remaining value is a whole number.
    ids = comments["comment_id"]
    if pd.api.types.is_float_dtype(ids) and (ids % 1 == 0).all():
        comments["comment_id"] = ids.astype("int64")

    for col in RAW_COLS:
        if col not in comments.columns:
            comments[col] = None
    return comments[RAW_COLS].copy()
|
||||||
|
|
||||||
|
|
||||||
|
def load_analysis(jobs_dir: Path) -> pd.DataFrame:
    """Load and stack every analysis output file of a job directory.

    Files matching ``job*-output.jsonl`` are read in sorted name order and
    concatenated; names containing ``-raw`` are skipped. Any column of
    ``ANALYSIS_COLS`` absent from the data is added as an all-null column.

    Args:
        jobs_dir: Directory holding the ``job*-output.jsonl`` files.

    Returns:
        A new DataFrame with ``comment_id`` followed by ``ANALYSIS_COLS``.

    Raises:
        ValueError: If ``jobs_dir`` contains no matching output file.
        KeyError: If the loaded records have no ``comment_id`` field.
    """
    files = sorted(
        p for p in jobs_dir.glob("job*-output.jsonl") if "-raw" not in p.name
    )
    if not files:
        # pd.concat([]) would fail with an opaque "No objects to concatenate";
        # name the directory and pattern so a wrong path is obvious.
        raise ValueError(f"no job*-output.jsonl files found in {jobs_dir}")

    df = pd.concat(
        [pd.read_json(p, lines=True) for p in files], ignore_index=True
    )
    for col in ANALYSIS_COLS:
        if col not in df.columns:
            df[col] = None
    return df[["comment_id"] + ANALYSIS_COLS].copy()
|
||||||
|
|
||||||
|
|
||||||
|
def join(raw: pd.DataFrame, analysis: pd.DataFrame) -> pd.DataFrame:
    """Left-join analysis records onto raw comments by ``comment_id``.

    Every raw row is kept; rows without a matching analysis record get null
    analysis fields. The result is restricted to ``OUTPUT_COLS``, in order.
    """
    combined = pd.merge(raw, analysis, how="left", on="comment_id")
    return combined[OUTPUT_COLS]
|
||||||
|
|
||||||
|
|
||||||
|
def print_counts(raw: pd.DataFrame, analysis: pd.DataFrame, merged: pd.DataFrame) -> None:
    """Print validation counts for a raw/analysis join to stdout.

    Args:
        raw: Raw comment rows, as passed to the join.
        analysis: Analysis records, as passed to the join.
        merged: Result of left-joining ``analysis`` onto ``raw``.
    """
    # A merged row is "joined" when its comment_id has an analysis record.
    # Testing for a non-null stance instead would report records that carry
    # no stance (presumably the errored ones) as never analyzed.
    matched = merged["comment_id"].isin(analysis["comment_id"])
    joined = int(matched.sum())
    unanalyzed = int((~matched).sum())

    print(f"\nRaw comments : {len(raw):,}")
    print(f"Analyzed : {len(analysis):,}")
    print(f"Joined : {joined:,}")
    print(f"Unanalyzed : {unanalyzed:,}")
    print(f"Errors : {analysis['error'].notna().sum():,}")
    print(f"Dup IDs (raw) : {raw['comment_id'].duplicated().sum():,}")
    print(f"\nStance:\n{analysis['stance'].value_counts(dropna=False).to_string()}")
    print(f"\nTone:\n{analysis['tone'].value_counts(dropna=False).to_string()}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main(argv=None) -> None:
    """Command-line entry point: join, print counts, write the review file(s).

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, so existing ``main()`` calls are unchanged.
    """
    parser = argparse.ArgumentParser(
        description="Join raw scrape JSONL with analysis output; write review CSV."
    )
    parser.add_argument("input", help="Raw scrape JSONL (e.g. output/f452.jsonl)")
    parser.add_argument("jobs_dir", help="Job directory containing job*-output.jsonl files")
    parser.add_argument("--parquet", action="store_true", help="Also write review.parquet")
    parser.add_argument("--out", default=None, help="Output CSV path (default: <jobs_dir>/review.csv)")
    args = parser.parse_args(argv)

    jobs_dir = Path(args.jobs_dir)
    raw = load_raw(Path(args.input))
    analysis = load_analysis(jobs_dir)
    merged = join(raw, analysis)
    print_counts(raw, analysis, merged)

    out = Path(args.out) if args.out else jobs_dir / "review.csv"
    # --out may point into a directory that does not exist yet.
    out.parent.mkdir(parents=True, exist_ok=True)
    # utf-8-sig prepends a BOM so Excel detects the encoding.
    merged.to_csv(out, index=False, encoding="utf-8-sig")
    print(f"CSV → {out}")

    if args.parquet:
        pq = out.with_suffix(".parquet")
        merged.to_parquet(pq, index=False)
        print(f"Parquet → {pq}")


if __name__ == "__main__":
    main()
|
||||||
9091
analysis/jobs/f452-1/review.csv
Normal file
9091
analysis/jobs/f452-1/review.csv
Normal file
File diff suppressed because one or more lines are too long
BIN
analysis/jobs/f452-1/review.xlsx
Normal file
BIN
analysis/jobs/f452-1/review.xlsx
Normal file
Binary file not shown.
@@ -244,9 +244,9 @@ python analysis/openai_batch.py submit
|
|||||||
- tests: passing (pytest tests/openai_batch.py tests/openai_realtime.py tests/tokenizer.py)
|
- tests: passing (pytest tests/openai_batch.py tests/openai_realtime.py tests/tokenizer.py)
|
||||||
- datetime: [2026-05-06 Wed]
|
- datetime: [2026-05-06 Wed]
|
||||||
|
|
||||||
* === Backlog ===
|
* [X] t1.3: cleanup model output and rejoin
|
||||||
* [ ] X: analysis validation view
|
|
||||||
create a lightweight validation script that joins raw comments to normalized analysis output and writes a human-reviewable csv.
|
create a lightweight validation script that joins raw comments to normalized analysis output and writes a human-reviewable csv.
|
||||||
|
review create_csv for the simple approach - keep this regardless
|
||||||
|
|
||||||
** acceptance criteria
|
** acceptance criteria
|
||||||
1. input raw scrape jsonl and all *-output.jsonl files in a dir
|
1. input raw scrape jsonl and all *-output.jsonl files in a dir
|
||||||
@@ -255,7 +255,8 @@ create a lightweight validation script that joins raw comments to normalized ana
|
|||||||
- forum_id, comment_id, title, text, date, author
|
- forum_id, comment_id, title, text, date, author
|
||||||
- stance, stance_confidence, stance_rationale, tone, tags
|
- stance, stance_confidence, stance_rationale, tone, tags
|
||||||
- error, truncated, analyzed_at, prompt_version, model
|
- error, truncated, analyzed_at, prompt_version, model
|
||||||
4. print validation counts
|
4. output parquet?
|
||||||
|
5. print validation counts
|
||||||
- raw comments
|
- raw comments
|
||||||
- analyzed records
|
- analyzed records
|
||||||
- joined records
|
- joined records
|
||||||
@@ -264,16 +265,30 @@ create a lightweight validation script that joins raw comments to normalized ana
|
|||||||
- error records
|
- error records
|
||||||
- stance counts
|
- stance counts
|
||||||
- tone counts
|
- tone counts
|
||||||
5. tests cover join behavior and missing/duplicate ids
|
6. tests cover join behavior and missing/duplicate ids
|
||||||
|
|
||||||
|
** notes
|
||||||
|
- analysis/create_csv.py: reads raw scrape JSONL + all job*-output.jsonl in a job dir (skips *-output-raw.jsonl); left-joins on comment_id; writes review.csv (UTF-8 BOM for Excel); optional --parquet.
|
||||||
|
- Uses pd.read_json(path, lines=True) — no manual JSON parsing.
|
||||||
|
- Prints summary counts: raw/analyzed/joined/unanalyzed/errors/duplicate IDs, stance distribution, tone distribution.
|
||||||
|
|
||||||
|
*** usage
|
||||||
|
#+begin_src sh
|
||||||
|
python analysis/create_csv.py output/f452.jsonl analysis/jobs/f452-1/
|
||||||
|
python analysis/create_csv.py output/f452.jsonl analysis/jobs/f452-1/ --parquet
|
||||||
|
# output: analysis/jobs/f452-1/review.csv (and optionally review.parquet)
|
||||||
|
#+end_src
|
||||||
|
|
||||||
** evidence
|
** evidence
|
||||||
- commit:
|
- commit:
|
||||||
- tests:
|
- tests: passing (pytest tests/create_csv.py tests/encoding.py)
|
||||||
- csv:
|
- csv: analysis/jobs/f452-1/review.csv
|
||||||
- datetime:
|
- datetime: [2026-05-07 Thu]
|
||||||
* [ ] X: text encoding cleanup
|
|
||||||
|
* [X] t1.1.1: text encoding cleanup
|
||||||
fix mojibake in scraped text before analysis/reporting, especially curly quotes (’) showing as â€™.
|
fix mojibake in scraped text before analysis/reporting, especially curly quotes (’) showing as â€™.
|
||||||
|
|
||||||
|
|
||||||
** acceptance criteria
|
** acceptance criteria
|
||||||
1. identify whether mojibake exists in raw scrape, analysis output, or csv export only
|
1. identify whether mojibake exists in raw scrape, analysis output, or csv export only
|
||||||
2. add repair step at the earliest correct layer
|
2. add repair step at the earliest correct layer
|
||||||
@@ -286,11 +301,29 @@ fix mojibake in scraped text before analysis/reporting, especially curly quotes
|
|||||||
- —
|
- —
|
||||||
5. document whether repaired text is used for model input
|
5. document whether repaired text is used for model input
|
||||||
|
|
||||||
|
** notes
|
||||||
|
- Diagnosis: f452.jsonl raw data is CLEAN — proper Unicode throughout (U+2019, U+201C, etc.). The DEFAULT_RESPONSE_ENCODING=utf-8 spider setting is working for this site. No mojibake or FFFD chars found.
|
||||||
|
- The encoding issue would surface for forums whose server sends cp1252 bytes (0x91-0x97 range) embedded in otherwise UTF-8 content. FFFD replacement chars appear when the UTF-8 decoder hits those bytes. Once the byte is replaced by FFFD, the original character cannot be recovered.
|
||||||
|
- Repair layer: analysis/encoding.py applied in analysis/validate.py at reporting time. Raw scrape JSONL is never modified (AC3).
|
||||||
|
- Model input: repair_text() is NOT applied in build_messages() for this dataset since raw data is clean. Can be added if a future forum produces dirty text.
|
||||||
|
- Spider: DEFAULT_RESPONSE_ENCODING=utf-8 remains. If a future forum genuinely sends cp1252, change to 'cp1252' and apply ftfy post-decode in the item pipeline.
|
||||||
|
|
||||||
** evidence
|
** evidence
|
||||||
- commit:
|
- commit:
|
||||||
- tests:
|
- tests: passing (pytest tests/encoding.py)
|
||||||
- before/after sample:
|
- before/after sample: N/A — f452.jsonl is clean; tests cover synthetic mojibake patterns
|
||||||
- datetime:
|
- datetime: [2026-05-07 Thu]
|
||||||
|
* === Backlog ===
|
||||||
|
* [ ] X: first dash explorer
|
||||||
|
create a local dash app for exploring one forum analysis dataset.
|
||||||
|
|
||||||
|
** acceptance criteria
|
||||||
|
1. load parquet/csv review dataset
|
||||||
|
2. show stance counts, tone counts, tag counts, and confidence histogram
|
||||||
|
3. provide filters for stance, tone, confidence, tag, and text search
|
||||||
|
4. show filtered comment table
|
||||||
|
5. clicking/selecting a comment shows full text and model rationale
|
||||||
|
6. app runs locally with one command
|
||||||
* [ ] X: complete proposal information
|
* [ ] X: complete proposal information
|
||||||
Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted.
|
Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted.
|
||||||
** acceptance criteria
|
** acceptance criteria
|
||||||
|
|||||||
155
tests/create_csv.py
Normal file
155
tests/create_csv.py
Normal file
@@ -0,0 +1,155 @@
|
|||||||
|
"""Unit tests for analysis/create_csv.py — no external API calls."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
|
||||||
|
import create_csv as cc
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
|
||||||
|
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as UTF-8 JSON Lines, one object per line."""
    with open(path, "w", encoding="utf-8") as f:
        # ensure_ascii=False writes non-ASCII text as real UTF-8 rather than
        # \uXXXX escapes, so fixtures can carry curly quotes and the like in
        # the same form as the helper in tests/validate-sentiment.py.
        f.writelines(json.dumps(row, ensure_ascii=False) + "\n" for row in rows)
|
||||||
|
|
||||||
|
|
||||||
|
# Three raw comments; only the first two have an analysis record below.
RAW_ROWS = [
    {
        "forum_id": "452",
        "comment_id": "1",
        "title": "Support",
        "text": "I support.",
        "date": "2021-01-01",
        "author": "Alice",
    },
    {
        "forum_id": "452",
        "comment_id": "2",
        "title": "Oppose",
        "text": "I oppose.",
        "date": "2021-01-02",
        "author": "Bob",
    },
    {
        "forum_id": "452",
        "comment_id": "3",
        "title": "Neutral",
        "text": "No opinion.",
        "date": "2021-01-03",
        "author": "Carol",
    },
]

# Analysis records for comments 1 and 2; comment 3 is deliberately absent.
ANALYSIS_ROWS = [
    {
        "comment_id": "1",
        "stance": "support",
        "stance_confidence": 0.9,
        "stance_rationale": "clear support",
        "tone": "neutral",
        "tags": '["policy"]',
        "error": None,
        "truncated": False,
        "analyzed_at": "2021-01-10",
        "prompt_version": "1",
        "model": "gpt-4o-mini",
    },
    {
        "comment_id": "2",
        "stance": "oppose",
        "stance_confidence": 0.8,
        "stance_rationale": "clear oppose",
        "tone": "negative",
        "tags": '[]',
        "error": None,
        "truncated": False,
        "analyzed_at": "2021-01-10",
        "prompt_version": "1",
        "model": "gpt-4o-mini",
    },
]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# load_raw
|
||||||
|
|
||||||
|
def test_load_raw_returns_raw_cols(tmp_path):
    """load_raw yields exactly RAW_COLS, in order."""
    source = tmp_path / "forum.jsonl"
    _write_jsonl(source, RAW_ROWS)
    loaded = cc.load_raw(source)
    assert loaded.columns.tolist() == cc.RAW_COLS
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_raw_row_count(tmp_path):
    """Every comment row written to the file is loaded."""
    source = tmp_path / "forum.jsonl"
    _write_jsonl(source, RAW_ROWS)
    loaded = cc.load_raw(source)
    assert loaded.shape[0] == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_raw_skips_non_comment_rows(tmp_path):
    """Rows without comment_id (e.g. forum metadata) are dropped."""
    # The metadata row goes first: load_raw documents the non-comment item
    # as the first item of the scrape, so exercise that layout.
    metadata = {"forum_id": "452", "reg_title": "Metadata row"}
    p = tmp_path / "forum.jsonl"
    _write_jsonl(p, [metadata] + RAW_ROWS)
    df = cc.load_raw(p)
    assert len(df) == 3
    assert df["comment_id"].notna().all()
    # Fields that exist only on the metadata row must not leak through.
    assert "reg_title" not in df.columns
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# load_analysis
|
||||||
|
|
||||||
|
def test_load_analysis_returns_analysis_cols(tmp_path):
    """load_analysis yields comment_id followed by ANALYSIS_COLS, in order."""
    job_dir = tmp_path / "jobs"
    job_dir.mkdir()
    _write_jsonl(job_dir / "job1-output.jsonl", ANALYSIS_ROWS)
    loaded = cc.load_analysis(job_dir)
    assert list(loaded.columns) == ["comment_id", *cc.ANALYSIS_COLS]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_analysis_skips_raw_files(tmp_path):
    """A *-output-raw.jsonl file next to the output file contributes no rows."""
    job_dir = tmp_path / "jobs"
    job_dir.mkdir()
    for filename in ("job1-output.jsonl", "job1-output-raw.jsonl"):
        # The second file must be ignored by load_analysis.
        _write_jsonl(job_dir / filename, ANALYSIS_ROWS)
    loaded = cc.load_analysis(job_dir)
    assert len(loaded) == len(ANALYSIS_ROWS)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_analysis_concatenates_multiple_files(tmp_path):
    """Records spread over several job output files are stacked together."""
    job_dir = tmp_path / "jobs"
    job_dir.mkdir()
    first, second = ANALYSIS_ROWS[0], ANALYSIS_ROWS[1]
    _write_jsonl(job_dir / "job1-output.jsonl", [first])
    _write_jsonl(job_dir / "job2-output.jsonl", [second])
    loaded = cc.load_analysis(job_dir)
    assert len(loaded) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# join
|
||||||
|
|
||||||
|
def test_join_all_raw_preserved(tmp_path):
    """Left join: all raw comments appear in output, even without analysis."""
    raw = pd.DataFrame(RAW_ROWS)[cc.RAW_COLS]
    analysis = pd.DataFrame(ANALYSIS_ROWS)
    absent = [name for name in cc.ANALYSIS_COLS if name not in analysis.columns]
    for name in absent:
        analysis[name] = None
    analysis = analysis[["comment_id", *cc.ANALYSIS_COLS]]

    merged = cc.join(raw, analysis)
    # All 3 raw rows survive, including comment_id=3 which has no analysis.
    assert len(merged) == 3
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_unanalyzed_row_has_null_stance(tmp_path):
    """A raw comment without an analysis record gets a null stance."""
    raw = pd.DataFrame(RAW_ROWS)[cc.RAW_COLS]
    analysis = pd.DataFrame(ANALYSIS_ROWS)
    absent = [name for name in cc.ANALYSIS_COLS if name not in analysis.columns]
    for name in absent:
        analysis[name] = None
    analysis = analysis[["comment_id", *cc.ANALYSIS_COLS]]

    merged = cc.join(raw, analysis)
    is_third = merged["comment_id"] == "3"
    stance = merged[is_third].iloc[0]["stance"]
    assert pd.isna(stance)
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_column_order(tmp_path):
    """The joined frame carries exactly OUTPUT_COLS, in order."""
    raw = pd.DataFrame(RAW_ROWS)[cc.RAW_COLS]
    analysis = pd.DataFrame(ANALYSIS_ROWS)
    absent = [name for name in cc.ANALYSIS_COLS if name not in analysis.columns]
    for name in absent:
        analysis[name] = None
    analysis = analysis[["comment_id", *cc.ANALYSIS_COLS]]

    merged = cc.join(raw, analysis)
    assert merged.columns.tolist() == cc.OUTPUT_COLS
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# End-to-end: write + read CSV
|
||||||
|
|
||||||
|
def test_csv_written_correctly(tmp_path):
    """End to end: load, join, write the CSV, and read it back."""
    raw_path = tmp_path / "forum.jsonl"
    _write_jsonl(raw_path, RAW_ROWS)

    jobs = tmp_path / "jobs"
    jobs.mkdir()
    _write_jsonl(jobs / "job1-output.jsonl", ANALYSIS_ROWS)

    out = tmp_path / "review.csv"
    raw = cc.load_raw(raw_path)
    analysis = cc.load_analysis(jobs)
    merged = cc.join(raw, analysis)
    merged.to_csv(out, index=False, encoding="utf-8-sig")

    # The file is written as utf-8-sig, so it must start with the UTF-8 BOM.
    assert out.read_bytes().startswith(b"\xef\xbb\xbf")

    # Read back with the encoding it was written in rather than relying on
    # the parser to strip the BOM from the first header.
    loaded = pd.read_csv(out, encoding="utf-8-sig")
    assert len(loaded) == 3
    assert list(loaded.columns) == cc.OUTPUT_COLS
|
||||||
217
tests/validate-sentiment.py
Normal file
217
tests/validate-sentiment.py
Normal file
@@ -0,0 +1,217 @@
|
|||||||
|
"""Unit tests for analysis/validate.py — no file I/O beyond tmp_path."""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
|
||||||
|
|
||||||
|
# Skip this whole module when pandas is unavailable; importorskip returns the
# imported module, so ``pd`` is bound exactly as ``import pandas as pd`` would.
pd = pytest.importorskip("pandas")
|
||||||
|
|
||||||
|
import validate as vl
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures
|
||||||
|
|
||||||
|
|
||||||
|
def _write_jsonl(path: Path, rows: list[dict]) -> None:
    """Write *rows* to *path* as UTF-8 JSON Lines, keeping non-ASCII text unescaped."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in rows
        )
|
||||||
|
|
||||||
|
|
||||||
|
# Three raw comments; only the first two have an analysis record below.
RAW_ROWS = [
    {
        "forum_id": "452",
        "comment_id": "1",
        "title": "Support it",
        "text": "I support this.",
        "date": "2021-01-04T09:00:00",
        "author": "Alice",
    },
    {
        "forum_id": "452",
        "comment_id": "2",
        "title": "Oppose it",
        "text": "I oppose this.",
        "date": "2021-01-05T10:00:00",
        "author": "Bob",
    },
    {
        "forum_id": "452",
        "comment_id": "3",
        "title": "Neutral",
        "text": "No opinion.",
        "date": "2021-01-06T11:00:00",
        "author": "Carol",
    },
]

# Analysis records for comments 1 and 2; tags are real lists here.
ANALYSIS_ROWS = [
    {
        "run_id": "r1",
        "forum_id": "452",
        "comment_id": "1",
        "input_title": "Support it",
        "analyzed_at": "2026-05-06T12:00:00+00:00",
        "model": "gpt-5.4-mini",
        "prompt_version": "abc1234",
        "stance": "support",
        "stance_confidence": 0.95,
        "stance_rationale": "Commenter says 'I support'.",
        "tone": "positive",
        "tags": ["student safety"],
        "truncated": False,
        "error": None,
    },
    {
        "run_id": "r1",
        "forum_id": "452",
        "comment_id": "2",
        "input_title": "Oppose it",
        "analyzed_at": "2026-05-06T12:00:00+00:00",
        "model": "gpt-5.4-mini",
        "prompt_version": "abc1234",
        "stance": "oppose",
        "stance_confidence": 0.90,
        "stance_rationale": "Commenter says 'I oppose'.",
        "tone": "negative",
        "tags": [],
        "truncated": False,
        "error": None,
    },
]

# Forum-level item: it has no comment_id, so it is not a comment row.
FORUM_ROW = {
    "forum_id": "452",
    "reg_title": "Policy X",
    "reg_desc": "Guidance on Y.",
}
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def raw_jsonl(tmp_path) -> Path:
    """Raw scrape file: the forum item followed by the three comment rows."""
    target = tmp_path / "f452.jsonl"
    _write_jsonl(target, [FORUM_ROW, *RAW_ROWS])
    return target
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture()
def jobs_dir(tmp_path) -> Path:
    """Job directory holding one output file with the two analysis records."""
    directory = tmp_path / "jobs" / "f452-1"
    directory.mkdir(parents=True)
    _write_jsonl(directory / "job1-output.jsonl", ANALYSIS_ROWS)
    return directory
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# load_raw
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_raw_returns_only_comments(raw_jsonl):
    """Only the three comment rows are loaded, carrying the RAW_COLS columns."""
    loaded = vl.load_raw(raw_jsonl)
    assert loaded.shape[0] == 3
    assert set(vl.RAW_COLS) == set(loaded.columns)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_raw_correct_columns(raw_jsonl):
    """Every RAW_COLS column is present in the loaded frame."""
    present = vl.load_raw(raw_jsonl).columns
    for expected in vl.RAW_COLS:
        assert expected in present
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_raw_skips_forum_item(raw_jsonl):
    """Fields that exist only on the forum item do not become columns."""
    loaded = vl.load_raw(raw_jsonl)
    assert "reg_title" not in loaded.columns
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# load_analysis
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_analysis_skips_raw_files(tmp_path):
    """A *-output-raw.jsonl file next to the output file contributes no rows."""
    directory = tmp_path / "jobs" / "f452-1"
    directory.mkdir(parents=True)
    for filename in ("job1-output-raw.jsonl", "job1-output.jsonl"):
        # The first file must be ignored by load_analysis.
        _write_jsonl(directory / filename, ANALYSIS_ROWS)
    loaded = vl.load_analysis(directory)
    assert len(loaded) == len(ANALYSIS_ROWS)
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_analysis_concatenates_multiple_files(tmp_path):
    """Records spread over several job output files are stacked together."""
    directory = tmp_path / "jobs" / "f452-1"
    directory.mkdir(parents=True)
    first, second = ANALYSIS_ROWS[0], ANALYSIS_ROWS[1]
    _write_jsonl(directory / "job1-output.jsonl", [first])
    _write_jsonl(directory / "job2-output.jsonl", [second])
    loaded = vl.load_analysis(directory)
    assert len(loaded) == 2
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_analysis_tags_serialized_as_json(jobs_dir):
    """A tag list is exposed as a JSON string that decodes to the same list."""
    loaded = vl.load_analysis(jobs_dir)
    first = loaded[loaded["comment_id"] == "1"]
    serialized = first["tags"].iloc[0]
    assert isinstance(serialized, str)
    assert json.loads(serialized) == ["student safety"]
|
||||||
|
|
||||||
|
|
||||||
|
def test_load_analysis_empty_tags_serialized(jobs_dir):
    """An empty tag list round-trips through its serialized form."""
    loaded = vl.load_analysis(jobs_dir)
    second = loaded[loaded["comment_id"] == "2"]
    serialized = second["tags"].iloc[0]
    assert json.loads(serialized) == []
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# join — by comment_id, not index
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_by_comment_id_not_index(raw_jsonl, jobs_dir):
    """Rows are matched on comment_id, not on their position in the frame."""
    analysis = vl.load_analysis(jobs_dir)
    # Shuffle the raw rows so their order no longer lines up with the index.
    shuffled = (
        vl.load_raw(raw_jsonl)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
    merged = vl.join(shuffled, analysis)
    first = merged[merged["comment_id"] == "1"].iloc[0]
    assert first["stance"] == "support"
    assert first["author"] == "Alice"
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_unanalyzed_comment_has_null_stance(raw_jsonl, jobs_dir):
    """Comment 3 is in raw but not in analysis — stance should be NaN."""
    merged = vl.join(vl.load_raw(raw_jsonl), vl.load_analysis(jobs_dir))
    is_third = merged["comment_id"] == "3"
    third = merged[is_third].iloc[0]
    assert pd.isna(third["stance"])
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_preserves_all_raw_comments(raw_jsonl, jobs_dir):
    """The joined frame has one row per raw comment."""
    comments = vl.load_raw(raw_jsonl)
    merged = vl.join(comments, vl.load_analysis(jobs_dir))
    assert len(comments) == len(merged)
|
||||||
|
|
||||||
|
|
||||||
|
def test_join_output_columns_in_order(raw_jsonl, jobs_dir):
    """The joined frame carries exactly OUTPUT_COLS, in order."""
    merged = vl.join(vl.load_raw(raw_jsonl), vl.load_analysis(jobs_dir))
    assert merged.columns.tolist() == vl.OUTPUT_COLS
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Duplicate comment_id handling
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_raw_id_flagged(raw_jsonl, jobs_dir):
    """A repeated raw comment_id is kept by the join and is detectable."""
    original = vl.load_raw(raw_jsonl)
    # Append a copy of the first row to create one duplicate comment_id.
    raw = pd.concat([original, original.iloc[[0]]], ignore_index=True)
    analysis = vl.load_analysis(jobs_dir)
    merged = vl.join(raw, analysis)
    # Left join: still one output row per raw row.
    assert len(merged) == len(raw)
    assert raw["comment_id"].duplicated().sum() == 1
|
||||||
|
|
||||||
|
|
||||||
|
def test_duplicate_analysis_id_produces_extra_rows(raw_jsonl, tmp_path):
    """Two analysis records for the same comment_id create two joined rows."""
    directory = tmp_path / "jobs" / "f452-dup"
    directory.mkdir(parents=True)
    base = ANALYSIS_ROWS[0]
    conflicting = {**base, "stance": "oppose"}
    _write_jsonl(directory / "job1-output.jsonl", [base, conflicting])

    merged = vl.join(vl.load_raw(raw_jsonl), vl.load_analysis(directory))
    rows_for_first = merged[merged["comment_id"] == "1"]
    assert len(rows_for_first) == 2
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Validation counts (smoke test — just confirm it runs without error)
|
||||||
|
|
||||||
|
|
||||||
|
def test_print_validation_runs(raw_jsonl, jobs_dir, capsys):
    """Smoke test: the report prints and contains its section headings."""
    raw = vl.load_raw(raw_jsonl)
    analysis = vl.load_analysis(jobs_dir)
    vl.print_validation(raw, analysis, vl.join(raw, analysis))
    printed = capsys.readouterr().out
    for heading in ("Raw comments", "Stance counts", "Tone counts"):
        assert heading in printed
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CSV output
|
||||||
|
|
||||||
|
|
||||||
|
def test_csv_written_to_jobs_dir(raw_jsonl, jobs_dir, tmp_path):
    """The joined frame round-trips through a BOM-prefixed CSV in the job dir."""
    raw = vl.load_raw(raw_jsonl)
    merged = vl.join(raw, vl.load_analysis(jobs_dir))

    destination = jobs_dir / "review.csv"
    merged.to_csv(destination, index=False, encoding="utf-8-sig")
    assert destination.exists()

    reloaded = pd.read_csv(destination, encoding="utf-8-sig")
    assert reloaded.columns.tolist() == vl.OUTPUT_COLS
    assert len(reloaded) == len(raw)
|
||||||
Reference in New Issue
Block a user