# scrape-giant/layer_helpers.py

import csv
import hashlib
from collections import Counter
from pathlib import Path


def read_csv_rows(path):
    """Load every data row of a CSV file as a list of dicts.

    Args:
        path: Location of the CSV file (anything ``pathlib.Path`` accepts).

    Returns:
        One dict per data row, keyed by the header row, as produced by
        ``csv.DictReader``.
    """
    # "utf-8-sig" decodes plain UTF-8 unchanged but drops a leading byte-order
    # mark, so the first column name is never read as "\ufeff<name>".
    with Path(path).open(newline="", encoding="utf-8-sig") as handle:
        return list(csv.DictReader(handle))


def write_csv_rows(path, rows, fieldnames):
    """Write dict rows to a UTF-8 CSV file, header line first.

    Missing parent directories are created, and the file is opened in "w"
    mode, so any existing content at ``path`` is replaced.

    Args:
        path: Destination file (anything ``pathlib.Path`` accepts).
        rows: Iterable of dicts to serialize.
        fieldnames: Column names; they define the header and column order.
    """
    destination = Path(path)
    destination.parent.mkdir(parents=True, exist_ok=True)
    # newline="" leaves line-ending handling to the csv module.
    with open(destination, mode="w", encoding="utf-8", newline="") as sink:
        out = csv.DictWriter(sink, fieldnames=fieldnames)
        out.writeheader()
        out.writerows(rows)


def stable_id(prefix, raw_key):
    """Build a deterministic identifier of the form ``<prefix>_<hash>``.

    Args:
        prefix: Text placed before the underscore.
        raw_key: Value to hash; it is passed through ``str()`` first.

    Returns:
        ``prefix``, an underscore, and the first 12 hex characters of the
        SHA-1 digest of the UTF-8 encoded key.
    """
    encoded_key = str(raw_key).encode("utf-8")
    short_hash = hashlib.sha1(encoded_key).hexdigest()[:12]
    return "{}_{}".format(prefix, short_hash)


def first_nonblank(rows, field):
    """Return the first truthy value stored under ``field`` across ``rows``.

    Args:
        rows: Iterable of dict-like rows.
        field: Key to look up in each row.

    Returns:
        The first truthy value found in iteration order, or ``""`` when no
        row has one (rows lacking the key are skipped).
    """
    candidates = (row.get(field, "") for row in rows)
    return next((found for found in candidates if found), "")


def representative_value(rows, field):
    """Return the most frequent truthy value of ``field`` across ``rows``.

    Ties are broken by taking the value that sorts first.

    Args:
        rows: Iterable of dict-like rows.
        field: Key to look up in each row.

    Returns:
        The winning value, or ``""`` when no row has a truthy value.
    """
    tally = Counter()
    for row in rows:
        candidate = row.get(field, "")
        if candidate:
            tally[candidate] += 1
    if not tally:
        return ""
    # Highest count first; among equal counts, smallest value first.
    ranked = sorted(tally, key=lambda value: (-tally[value], value))
    return ranked[0]


def distinct_values(rows, field):
    """Return the sorted unique truthy values of ``field`` across ``rows``.

    Args:
        rows: Iterable of dict-like rows.
        field: Key to look up in each row.

    Returns:
        A sorted list without duplicates; rows where the value is missing or
        falsy contribute nothing.
    """
    unique = set()
    for row in rows:
        candidate = row.get(field, "")
        if candidate:
            unique.add(candidate)
    return sorted(unique)


def compact_join(values, limit=3):
    """Join the first ``limit`` distinct truthy values with ``" | "``.

    Args:
        values: Iterable of strings; falsy entries are ignored.
        limit: Slice bound applied to the de-duplicated values.

    Returns:
        The kept values, in first-seen order, joined by ``" | "``; an empty
        string when nothing is kept.
    """
    # dict keys keep insertion order, giving first-seen de-duplication.
    ordered_unique = list(dict.fromkeys(item for item in values if item))
    return " | ".join(ordered_unique[:limit])