import csv
import hashlib
from collections import Counter
from pathlib import Path


def read_csv_rows(path):
    """Read a UTF-8 CSV file and return its rows as a list of dicts.

    Args:
        path: Location of the CSV file; anything accepted by ``Path``.

    Returns:
        One mapping per data row, as produced by ``csv.DictReader`` (the
        first line of the file supplies the keys).
    """
    path = Path(path)
    # newline="" lets the csv module handle line endings itself, which is
    # what the csv documentation requires for correct quoted-newline parsing.
    with path.open(newline="", encoding="utf-8") as handle:
        return list(csv.DictReader(handle))


def write_csv_rows(path, rows, fieldnames):
    """Write ``rows`` to a UTF-8 CSV file with a header line.

    Missing parent directories are created. An existing file at ``path`` is
    overwritten.

    Args:
        path: Destination file; anything accepted by ``Path``.
        rows: Iterable of mappings to write, one per output row.
        fieldnames: Column names, in output order.

    Raises:
        ValueError: Raised by ``csv.DictWriter`` when a row contains a key
            that is not listed in ``fieldnames`` (its default behaviour).
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    with path.open("w", newline="", encoding="utf-8") as handle:
        writer = csv.DictWriter(handle, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)


def stable_id(prefix, raw_key):
    """Return a deterministic identifier of the form ``<prefix>_<digest>``.

    The digest is the first 12 hex characters of the SHA-1 hash of
    ``str(raw_key)`` encoded as UTF-8, so equal keys always map to the same
    identifier.
    """
    digest = hashlib.sha1(str(raw_key).encode("utf-8")).hexdigest()[:12]
    return f"{prefix}_{digest}"


def _nonblank_values(rows, field):
    """Yield each row's truthy value for ``field``, in row order."""
    for row in rows:
        value = row.get(field, "")
        # Truthiness skips missing keys, empty strings and None alike.
        if value:
            yield value


def first_nonblank(rows, field):
    """Return the first truthy value of ``field`` across ``rows``.

    Returns ``""`` when no row has a truthy value for ``field``.
    """
    return next(_nonblank_values(rows, field), "")


def representative_value(rows, field):
    """Return the most frequent truthy value of ``field`` across ``rows``.

    Ties on frequency are broken by choosing the smallest value under
    normal ordering. Returns ``""`` when there are no truthy values.
    """
    counts = Counter(_nonblank_values(rows, field))
    if not counts:
        return ""
    # Highest count first, then smallest value: a single min() pass picks
    # the same winner a full sort on this key would put in first place.
    value, _count = min(counts.items(), key=lambda item: (-item[1], item[0]))
    return value


def distinct_values(rows, field):
    """Return the sorted list of distinct truthy values of ``field``."""
    return sorted(set(_nonblank_values(rows, field)))


def compact_join(values, limit=3):
    """Join the first ``limit`` distinct truthy values with ``" | "``.

    Duplicates and falsy entries are dropped; first-seen order is kept.

    Args:
        values: Iterable of hashable values (strings, for the join to work).
        limit: Upper slice bound applied to the de-duplicated list.

    Returns:
        The joined string, or ``""`` when nothing remains.
    """
    # dict.fromkeys de-duplicates while preserving insertion order.
    unique = list(dict.fromkeys(value for value in values if value))
    return " | ".join(unique[:limit])