added create_csv.py
This commit is contained in:
76
analysis/create_csv.py
Normal file
76
analysis/create_csv.py
Normal file
@@ -0,0 +1,76 @@
|
||||
#!/usr/bin/env python3
|
||||
"""analysis/create_csv.py — join raw scrape with analysis output for review."""
|
||||
|
||||
import argparse
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
# Columns taken from the raw scrape, in the order they appear in the output.
RAW_COLS = [
    "forum_id",
    "comment_id",
    "title",
    "text",
    "date",
    "author",
]

# Columns taken from the analysis job output, in the order they appear in the output.
ANALYSIS_COLS = [
    "stance",
    "stance_confidence",
    "stance_rationale",
    "tone",
    "tags",
    "error",
    "truncated",
    "analyzed_at",
    "prompt_version",
    "model",
]

# Full column layout of the joined frame: raw columns first, then analysis columns.
OUTPUT_COLS = [*RAW_COLS, *ANALYSIS_COLS]
|
||||
|
||||
|
||||
def load_raw(path: Path) -> pd.DataFrame:
    """Read the raw scrape JSONL at *path* and return its comment rows.

    Rows whose ``comment_id`` is null are dropped (the first item of the
    scrape is the forum itself, not a comment). Any column of ``RAW_COLS``
    missing from the file is added and filled with None.

    Args:
        path: JSON Lines file produced by the scraper.

    Returns:
        A new DataFrame holding exactly the ``RAW_COLS`` columns, in that order.
    """
    scraped = pd.read_json(path, lines=True)
    has_comment_id = scraped["comment_id"].notna()
    comments = scraped.loc[has_comment_id]
    # Pad the schema so the column selection below never fails on a missing key.
    absent = {name: None for name in RAW_COLS if name not in comments.columns}
    padded = comments.assign(**absent)
    return padded[RAW_COLS].copy()
|
||||
|
||||
|
||||
def load_analysis(jobs_dir: Path) -> pd.DataFrame:
    """Load and concatenate the analysis job outputs found in *jobs_dir*.

    Every ``job*-output.jsonl`` file directly inside *jobs_dir* whose name
    does not contain ``-raw`` is read as JSON Lines, in sorted path order,
    and the results are stacked into one frame with a fresh index. Any
    column of ``ANALYSIS_COLS`` missing from the inputs is added and filled
    with None.

    Args:
        jobs_dir: Directory to search (not searched recursively).

    Returns:
        A new DataFrame with ``comment_id`` followed by ``ANALYSIS_COLS``.

    Raises:
        ValueError: If no matching output file exists in *jobs_dir*.
        KeyError: If the loaded data has no ``comment_id`` column.
    """
    files = sorted(
        p for p in jobs_dir.glob("job*-output.jsonl") if "-raw" not in p.name
    )
    if not files:
        # pd.concat([]) would fail with the opaque "No objects to concatenate";
        # keep the exception type but say what was actually missing.
        raise ValueError(f"no job*-output.jsonl files found in {jobs_dir}")

    df = pd.concat(
        [pd.read_json(p, lines=True) for p in files], ignore_index=True
    )
    for col in ANALYSIS_COLS:
        if col not in df.columns:
            df[col] = None
    return df[["comment_id"] + ANALYSIS_COLS].copy()
|
||||
|
||||
|
||||
def join(raw: pd.DataFrame, analysis: pd.DataFrame) -> pd.DataFrame:
    """Attach analysis results to the raw comments.

    Performs a left merge on ``comment_id``, so every row of *raw* is kept
    and comments without a matching analysis row get nulls in the analysis
    columns. The result is restricted to ``OUTPUT_COLS``, in that order.

    Args:
        raw: Raw comment rows.
        analysis: Analysis rows keyed by ``comment_id``.

    Returns:
        The merged DataFrame.
    """
    # NOTE(review): a comment_id that occurs more than once in `analysis`
    # yields one output row per match — confirm job outputs are unique per id.
    combined = pd.merge(raw, analysis, how="left", on="comment_id")
    return combined[OUTPUT_COLS]
|
||||
|
||||
|
||||
def print_counts(raw: pd.DataFrame, analysis: pd.DataFrame, merged: pd.DataFrame) -> None:
    """Print a summary of the join to stdout.

    Reports the row counts of *raw* and *analysis*, how many rows of
    *merged* have a non-null / null ``stance`` ("Joined" / "Unanalyzed"),
    the number of non-null ``error`` values in *analysis*, the number of
    repeated ``comment_id`` values in *raw*, and the value counts (nulls
    included) of the ``stance`` and ``tone`` columns of *analysis*.

    Args:
        raw: Raw comment rows; must have a ``comment_id`` column.
        analysis: Analysis rows; must have ``stance``, ``tone`` and ``error``.
        merged: Joined frame; must have a ``stance`` column.
    """

    def report(label: str, count: int) -> None:
        # One summary line: fixed label followed by a thousands-separated count.
        print(f"{label}{count:,}")

    report("\nRaw comments : ", len(raw))
    report("Analyzed : ", len(analysis))

    # A merged row counts as joined only when its stance is non-null.
    stance_known = merged["stance"].notna()
    report("Joined : ", stance_known.sum())
    report("Unanalyzed : ", (~stance_known).sum())

    report("Errors : ", analysis["error"].notna().sum())
    report("Dup IDs (raw) : ", raw["comment_id"].duplicated().sum())

    # The tone table is followed by one extra blank line; the stance table is not.
    for heading, column, tail in (("Stance", "stance", ""), ("Tone", "tone", "\n")):
        table = analysis[column].value_counts(dropna=False).to_string()
        print(f"\n{heading}:\n{table}{tail}")
|
||||
|
||||
|
||||
def _build_parser() -> argparse.ArgumentParser:
    """Construct the command-line parser for this script."""
    parser = argparse.ArgumentParser(
        description="Join raw scrape JSONL with analysis output; write review CSV."
    )
    parser.add_argument("input", help="Raw scrape JSONL (e.g. output/f452.jsonl)")
    parser.add_argument("jobs_dir", help="Job directory containing job*-output.jsonl files")
    parser.add_argument("--parquet", action="store_true", help="Also write review.parquet")
    parser.add_argument("--out", default=None, help="Output CSV path (default: <jobs_dir>/review.csv)")
    return parser


def main() -> None:
    """Command-line entry point: load, join, summarise, and write the review file(s).

    Parses the arguments, loads the raw scrape and the analysis outputs,
    merges them, prints the summary counts, and writes the merged frame as
    CSV. With ``--parquet`` the same frame is also written next to the CSV
    with a ``.parquet`` suffix.
    """
    args = _build_parser().parse_args()
    jobs_dir = Path(args.jobs_dir)

    raw = load_raw(Path(args.input))
    analysis = load_analysis(jobs_dir)
    merged = join(raw, analysis)
    print_counts(raw, analysis, merged)

    # An omitted (or empty) --out falls back to review.csv inside the jobs directory.
    if args.out:
        csv_path = Path(args.out)
    else:
        csv_path = jobs_dir / "review.csv"

    # utf-8-sig writes a BOM at the start of the file.
    merged.to_csv(csv_path, index=False, encoding="utf-8-sig")
    print(f"CSV → {csv_path}")

    if not args.parquet:
        return

    parquet_path = csv_path.with_suffix(".parquet")
    merged.to_parquet(parquet_path, index=False)
    print(f"Parquet → {parquet_path}")
|
||||
|
||||
|
||||
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user