55 lines
1.4 KiB
Python
55 lines
1.4 KiB
Python
import csv
|
|
import hashlib
|
|
from collections import Counter
|
|
from pathlib import Path
|
|
|
|
|
|
def read_csv_rows(path):
    """Load a CSV file and return its rows as a list of dicts.

    The first line of the file is treated as the header; every following
    record becomes one dict keyed by those header names (see
    ``csv.DictReader``).

    Args:
        path: Location of the CSV file, as a string or ``Path``.

    Returns:
        A list with one dict per data row, in file order.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    path = Path(path)
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark. Without it, a BOM-prefixed export (common from spreadsheet tools)
    # leaves "\ufeff" glued to the first header name, so lookups on that
    # column silently come back empty.
    # newline="" lets the csv module handle line endings inside quoted fields.
    with path.open(newline="", encoding="utf-8-sig") as handle:
        return list(csv.DictReader(handle))
|
|
|
|
|
|
def write_csv_rows(path, rows, fieldnames):
    """Write dict rows to a CSV file, header line first.

    Any missing parent directories of ``path`` are created. An existing file
    at ``path`` is overwritten.

    Args:
        path: Destination file, as a string or ``Path``.
        rows: Iterable of dicts to write, one per CSV record.
        fieldnames: Column names; they define the header and column order.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    # newline="" hands line-ending control to the csv module.
    with open(destination, "w", newline="", encoding="utf-8") as out_file:
        csv_out = csv.DictWriter(out_file, fieldnames=fieldnames)
        csv_out.writeheader()
        csv_out.writerows(rows)
|
|
|
|
|
|
def stable_id(prefix, raw_key):
    """Build a deterministic identifier of the form ``<prefix>_<hash>``.

    ``raw_key`` is converted with ``str()``, encoded as UTF-8 and hashed with
    SHA-1; the first 12 hex characters of that hash form the suffix.

    Args:
        prefix: Text placed before the underscore.
        raw_key: Value to derive the id from; anything ``str()`` accepts.

    Returns:
        The identifier string.
    """
    key_bytes = str(raw_key).encode("utf-8")
    hex_digest = hashlib.sha1(key_bytes).hexdigest()
    short_hash = hex_digest[:12]
    return f"{prefix}_{short_hash}"
|
|
|
|
|
|
def first_nonblank(rows, field):
    """Return the first truthy value of ``field`` found in ``rows``.

    Rows are scanned in order. Rows that lack ``field`` or hold a falsy value
    for it (such as an empty string) are skipped.

    Args:
        rows: Iterable of dict-like rows.
        field: Key to look up in each row.

    Returns:
        The first truthy value, or ``""`` if there is none.
    """
    candidates = (row.get(field, "") for row in rows)
    return next((candidate for candidate in candidates if candidate), "")
|
|
|
|
|
|
def representative_value(rows, field):
    """Return the most frequent truthy value of ``field`` across ``rows``.

    Rows that lack ``field`` or hold a falsy value for it are ignored. When
    several values share the highest count, the smallest one in sort order
    is returned.

    Args:
        rows: Iterable of dict-like rows.
        field: Key to look up in each row.

    Returns:
        The chosen value, or ``""`` if no row has a truthy value.
    """
    tally = Counter()
    for row in rows:
        candidate = row.get(field, "")
        if candidate:
            tally[candidate] += 1

    if not tally:
        return ""

    # Rank by descending count, then ascending value to break ties.
    winner, _ = min(tally.items(), key=lambda pair: (-pair[1], pair[0]))
    return winner
|
|
|
|
|
|
def distinct_values(rows, field):
    """Return the sorted unique truthy values of ``field`` across ``rows``.

    Rows that lack ``field`` or hold a falsy value for it are ignored.

    Args:
        rows: Iterable of dict-like rows.
        field: Key to look up in each row.

    Returns:
        A sorted list with each value appearing once.
    """
    found = set()
    for row in rows:
        candidate = row.get(field, "")
        if candidate:
            found.add(candidate)
    return sorted(found)
|
|
|
|
|
|
def compact_join(values, limit=3):
    """Join the first few distinct truthy values with ``" | "``.

    Falsy values are dropped and repeats are removed, keeping each value at
    the position where it first appeared. The de-duplicated list is then
    sliced to ``limit`` entries before joining.

    Args:
        values: Iterable of strings.
        limit: Slice bound applied to the de-duplicated list (default 3).

    Returns:
        The joined string; ``""`` when nothing remains.
    """
    # dict keys keep insertion order, so this de-duplicates by first sighting.
    first_seen = dict.fromkeys(item for item in values if item)
    kept = list(first_seen)[:limit]
    return " | ".join(kept)
|