77 lines
2.8 KiB
Python
77 lines
2.8 KiB
Python
#!/usr/bin/env python3
|
|
"""analysis/create_csv.py — join raw scrape with analysis output for review."""
|
|
|
|
import argparse
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
# Columns kept from the raw scrape records (selected, in this order, by load_raw).
RAW_COLS = [
    "forum_id",
    "comment_id",
    "title",
    "text",
    "date",
    "author",
]

# Columns kept from the analysis job output (selected by load_analysis).
ANALYSIS_COLS = [
    "stance",
    "stance_confidence",
    "stance_rationale",
    "tone",
    "tags",
    "error",
    "truncated",
    "analyzed_at",
    "prompt_version",
    "model",
]

# Column order of the joined review table: raw columns first, then analysis.
OUTPUT_COLS = [*RAW_COLS, *ANALYSIS_COLS]
|
|
|
|
|
|
def load_raw(path: Path) -> pd.DataFrame:
    """Read the raw scrape JSONL and return one row per comment.

    Args:
        path: JSON-lines file produced by the scraper.

    Returns:
        A new DataFrame holding exactly the ``RAW_COLS`` columns, in that
        order. Any of those columns absent from the file is added and filled
        with ``None``.

    Raises:
        KeyError: if the file has no ``comment_id`` field at all.
    """
    df = pd.read_json(path, lines=True)

    # Records without a comment_id are not comments (per the original note,
    # the first item in the file describes the forum itself), so drop them.
    df = df[df["comment_id"].notna()].copy()

    # A single null comment_id makes read_json type the whole column as
    # float64, and it stays float after the null rows are removed. Left as is,
    # integer IDs would be written out as "123.0" and the join key would be a
    # float. Cast back only when that is lossless: every value is integral and
    # within the range a float64 represents exactly.
    ids = df["comment_id"]
    if (
        pd.api.types.is_float_dtype(ids)
        and (ids % 1 == 0).all()
        and (ids.abs() <= 2**53).all()
    ):
        df["comment_id"] = ids.astype("int64")

    for col in RAW_COLS:
        if col not in df.columns:
            df[col] = None
    return df[RAW_COLS].copy()
|
|
|
|
|
|
def load_analysis(jobs_dir: Path) -> pd.DataFrame:
    """Load and stack every analysis output file found in *jobs_dir*.

    Files are matched with the glob ``job*-output.jsonl``; any match whose
    name contains ``-raw`` is skipped. The remaining files are read in sorted
    path order and concatenated with a fresh index.

    Args:
        jobs_dir: Directory containing the job output files.

    Returns:
        A new DataFrame with ``comment_id`` followed by the ``ANALYSIS_COLS``
        columns. Any analysis column absent from the files is added and filled
        with ``None``.

    Raises:
        ValueError: if *jobs_dir* contains no matching output file.
        KeyError: if the loaded data has no ``comment_id`` field.
    """
    files = sorted(
        p for p in jobs_dir.glob("job*-output.jsonl") if "-raw" not in p.name
    )
    if not files:
        # pd.concat([]) would fail with a bare "No objects to concatenate";
        # keep the exception type but say what is actually missing and where.
        raise ValueError(f"No job*-output.jsonl files found in {jobs_dir}")

    frames = [pd.read_json(p, lines=True) for p in files]
    df = pd.concat(frames, ignore_index=True)
    for col in ANALYSIS_COLS:
        if col not in df.columns:
            df[col] = None
    return df[["comment_id"] + ANALYSIS_COLS].copy()
|
|
|
|
|
|
def join(raw: pd.DataFrame, analysis: pd.DataFrame) -> pd.DataFrame:
    """Attach analysis results to the raw comments, matching on ``comment_id``.

    This is a left join: every row of *raw* is kept, and comments with no
    matching analysis row get nulls in the analysis columns. If *analysis*
    holds several rows for one ``comment_id``, that comment appears once per
    match. The result is restricted to ``OUTPUT_COLS``, in that order.
    """
    combined = pd.merge(raw, analysis, how="left", on="comment_id")
    return combined[OUTPUT_COLS]
|
|
|
|
|
|
def print_counts(raw: pd.DataFrame, analysis: pd.DataFrame, merged: pd.DataFrame) -> None:
    """Print a summary of the join to stdout.

    Emits six count lines, then the value distributions (nulls included) of
    the ``stance`` and ``tone`` columns of *analysis*.

    "Joined" and "Unanalyzed" are derived from whether ``stance`` is null in
    *merged*, so a comment whose analysis row has a null stance is counted as
    unanalyzed.
    """
    # Each value is computed lazily, immediately before its line is printed.
    summary = (
        ("\nRaw comments : ", lambda: len(raw)),
        ("Analyzed : ", lambda: len(analysis)),
        ("Joined : ", lambda: merged["stance"].notna().sum()),
        ("Unanalyzed : ", lambda: merged["stance"].isna().sum()),
        ("Errors : ", lambda: analysis["error"].notna().sum()),
        ("Dup IDs (raw) : ", lambda: raw["comment_id"].duplicated().sum()),
    )
    for prefix, compute in summary:
        print(f"{prefix}{compute():,}")

    # The last table is followed by one extra blank line.
    for heading, column, trailer in (("Stance", "stance", ""), ("Tone", "tone", "\n")):
        table = analysis[column].value_counts(dropna=False).to_string()
        print(f"\n{heading}:\n{table}{trailer}")
|
|
|
|
|
|
def main() -> None:
    """Command-line entry point: load, join, report, and write the review files.

    Reads the raw scrape and the job outputs named on the command line, joins
    them, prints summary counts, and writes the result as CSV. The CSV goes to
    ``--out`` when given, otherwise to ``review.csv`` inside the jobs
    directory. With ``--parquet`` a Parquet copy is written to the CSV path
    with its suffix replaced by ``.parquet``.
    """
    parser = argparse.ArgumentParser(
        description="Join raw scrape JSONL with analysis output; write review CSV."
    )
    parser.add_argument("input", help="Raw scrape JSONL (e.g. output/f452.jsonl)")
    parser.add_argument("jobs_dir", help="Job directory containing job*-output.jsonl files")
    parser.add_argument("--parquet", action="store_true", help="Also write review.parquet")
    parser.add_argument("--out", default=None, help="Output CSV path (default: <jobs_dir>/review.csv)")
    opts = parser.parse_args()

    jobs_dir = Path(opts.jobs_dir)

    raw = load_raw(Path(opts.input))
    analysis = load_analysis(jobs_dir)
    merged = join(raw, analysis)
    print_counts(raw, analysis, merged)

    # An omitted (or empty) --out falls back to the default location.
    if opts.out:
        csv_path = Path(opts.out)
    else:
        csv_path = jobs_dir / "review.csv"

    # utf-8-sig writes a BOM-prefixed UTF-8 file (presumably so spreadsheet
    # tools detect the encoding).
    merged.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"CSV → {csv_path}")

    if not opts.parquet:
        return

    parquet_path = csv_path.with_suffix(".parquet")
    merged.to_parquet(parquet_path, index=False)
    print(f"Parquet → {parquet_path}")
|
|
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|