diff --git a/analysis/encoding.py b/analysis/encoding.py
new file mode 100644
index 0000000..efc96a5
--- /dev/null
+++ b/analysis/encoding.py
@@ -0,0 +1,90 @@
+"""
+analysis/encoding.py — text encoding repair for scraped content.
+
+The townhall.virginia.gov scraper forces UTF-8 decoding, which is correct for the
+site's current content. This module provides a defensive repair function for cases
+where a response arrives with Windows-1252/cp1252 bytes embedded in otherwise UTF-8
+content (common in older CMSes). The raw scrape files are never modified; repair is
+applied at the analysis and reporting layers only.
+
+Primary: uses `ftfy` when installed (pip install ftfy), with quote uncurling
+disabled so typographic quotes come back the same way on both paths.
+Fallback: re-encodes as cp1252 (then Latin-1), decodes as UTF-8 (pure mojibake
+strings only), then applies a table of known-bad patterns for mixed-encoding strings.
+"""
+
+try:
+    import ftfy as _ftfy
+except ImportError:  # optional dependency; repair_text then uses the stdlib only
+    _ftfy = None
+
+# ---------------------------------------------------------------------------
+# Known patterns: UTF-8 bytes decoded as cp1252, i.e. the 3-char sequences you
+# see when a server sends e.g. E2 80 99 and it gets decoded as cp1252 chars.
+#
+# Patterns are spelled with \u escapes, never literal characters: literal
+# mojibake in this file gets "repaired" by encoding-normalising tools, which
+# silently turns a (bad, good) pair into a no-op where bad == good.
+#
+# Byte → cp1252 char mappings used by the table:
+#   E2 → U+00E2 (a with circumflex, always)
+#   80 → U+20AC (euro sign, cp1252 0x80)
+#   99 → U+2122 (trade mark sign)     ← E2 80 99 = U+2019 right single quote
+#   98 → U+02DC (small tilde)         ← E2 80 98 = U+2018 left single quote
+#   9C → U+0153 (oe ligature)         ← E2 80 9C = U+201C left double quote
+#   9D → U+009D (undefined in cp1252) ← E2 80 9D = U+201D right double quote
+#   93 → U+201C (left double quote)   ← E2 80 93 = U+2013 en dash
+#   94 → U+201D (right double quote)  ← E2 80 94 = U+2014 em dash
+#   A6 → U+00A6 (broken bar)          ← E2 80 A6 = U+2026 ellipsis
+
+_KNOWN_REPAIRS: list[tuple[str, str]] = [
+    # Longer / more specific patterns first to avoid partial matches
+    ("\u00e2\u20ac\u2122", "\u2019"),  # right single quote
+    ("\u00e2\u20ac\u02dc", "\u2018"),  # left single quote
+    ("\u00e2\u20ac\u0153", "\u201c"),  # left double quote
+    ("\u00e2\u20ac\u009d", "\u201d"),  # right double quote
+    ("\u00e2\u20ac\u201c", "\u2013"),  # en dash
+    ("\u00e2\u20ac\u201d", "\u2014"),  # em dash
+    ("\u00e2\u20ac\u00a6", "\u2026"),  # ellipsis
+    # Generic fallback: bare E2 80 prefix not caught above → remove artifact
+    ("\u00e2\u20ac", ""),
+]
+
+
+def repair_text(text: str) -> str:
+    """Repair common encoding artifacts in scraped text.
+
+    Handles:
+    - UTF-8 bytes decoded as cp1252 or Latin-1 (E2 80 99 shown as three chars)
+    - Best-effort cleanup of mixed-encoding strings via the known-pattern table
+
+    U+FFFD replacement characters (from strict UTF-8 decoding of cp1252 bytes)
+    cannot be recovered since the original byte is lost; they are left as-is.
+
+    Args:
+        text: Decoded text that may contain mojibake.
+
+    Returns:
+        The repaired text. Falsy input ("" or None) is returned unchanged.
+    """
+    if not text:
+        return text
+
+    if _ftfy is not None:
+        # ftfy uncurls quotes by default; keep them so legitimate typographic
+        # quotes survive and the result matches the fallback path below.
+        return _ftfy.fix_text(text, uncurl_quotes=False)
+
+    # Fallback 1: pure mojibake — entire string is UTF-8 bytes read as a
+    # single-byte codec. Re-encode with that codec and decode as UTF-8. Clean
+    # ASCII round-trips unchanged; other clean text normally fails a step.
+    for codec in ("cp1252", "latin-1"):
+        try:
+            return text.encode(codec).decode("utf-8")
+        except (UnicodeEncodeError, UnicodeDecodeError):
+            continue
+
+    # Fallback 2: mixed strings — substitute known-bad patterns.
+    for bad, good in _KNOWN_REPAIRS:
+        text = text.replace(bad, good)
+    return text
diff --git a/tests/test_encoding.py b/tests/test_encoding.py
new file mode 100644
index 0000000..d95b298
--- /dev/null
+++ b/tests/test_encoding.py
@@ -0,0 +1,154 @@
+"""Unit tests for analysis/encoding.py — no external dependencies required.
+
+Mojibake fixtures are spelled with Unicode escapes so that encoding-normalising
+tools cannot turn the test inputs into already-clean text. The file is named
+test_encoding.py so that pytest collects it by default and the test module
+itself cannot shadow the ``encoding`` module imported below.
+"""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
+import encoding
+from encoding import repair_text, _KNOWN_REPAIRS
+
+# cp1252 rendering of the UTF-8 lead bytes E2 80 that starts every known pattern.
+_ARTIFACT = "\u00e2\u20ac"
+
+
+@pytest.fixture(params=["default", "fallback"])
+def repair(request, monkeypatch):
+    """Return repair_text as installed, then again with ftfy forced off."""
+    if request.param == "fallback":
+        monkeypatch.setattr(encoding, "_ftfy", None)
+    return repair_text
+
+
+# ---------------------------------------------------------------------------
+# Core contract
+
+
+def test_empty_string_unchanged(repair):
+    assert repair("") == ""
+
+
+def test_none_like_empty_unchanged(repair):
+    # Falsy input is handed straight back, so None must not raise.
+    assert repair(None) is None
+
+
+def test_clean_ascii_unchanged(repair):
+    text = "This is a normal sentence with no encoding issues."
+    assert repair(text) == text
+
+
+def test_clean_unicode_unchanged(repair):
+    text = "Caf\u00e9, na\u00efve, r\u00e9sum\u00e9 \u2014 proper Unicode already."
+    assert repair(text) == text
+
+
+def test_clean_curly_quotes_unchanged(repair):
+    text = "\u201cHello\u201d, Virginia\u2019s"
+    assert repair(text) == text
+
+
+# ---------------------------------------------------------------------------
+# Known mojibake sequences (tasks.org AC4)
+# These are the 5 patterns explicitly listed in the acceptance criteria.
+
+
+def test_right_single_quote(repair):
+    """E2 80 99 read as cp1252 → U+2019 right single quotation mark."""
+    assert repair("Virginia\u00e2\u20ac\u2122s") == "Virginia\u2019s"
+
+
+def test_left_double_quote(repair):
+    """E2 80 9C read as cp1252 → U+201C left double quotation mark."""
+    assert repair("\u00e2\u20ac\u0153Hello") == "\u201cHello"
+
+
+def test_en_dash(repair):
+    """E2 80 93 read as cp1252 → U+2013 en dash."""
+    assert repair("pages 1\u00e2\u20ac\u201c5") == "pages 1\u20135"
+
+
+def test_em_dash(repair):
+    """E2 80 94 read as cp1252 → U+2014 em dash."""
+    assert repair("word\u00e2\u20ac\u201dword") == "word\u2014word"
+
+
+def test_right_double_quote(repair):
+    """E2 80 9D (0x9D is undefined in cp1252, kept as U+009D) → U+201D."""
+    assert repair("said\u00e2\u20ac\u009d he") == "said\u201d he"
+
+
+# ---------------------------------------------------------------------------
+# Round-trip: garbled text produces sensible output
+
+
+def test_garbled_sentence_repaired(repair):
+    """A sentence containing mojibake is repaired to readable text."""
+    # "Don't" with a right single quote (U+2019) encoded as UTF-8, then
+    # decoded as cp1252.
+    result = repair("Don\u00e2\u20ac\u2122t worry")
+    assert result == "Don\u2019t worry"
+    assert _ARTIFACT not in result  # artifact gone
+
+
+def test_clean_string_after_repair_has_no_artifacts(repair):
+    garbled = "She said \u00e2\u20ac\u0153Hello\u00e2\u20ac\u009d and left."
+    result = repair(garbled)
+    assert _ARTIFACT not in result
+
+
+def test_mixed_clean_and_garbled_text(monkeypatch):
+    """Clean non-ASCII beside mojibake must be handled by the pattern table."""
+    monkeypatch.setattr(encoding, "_ftfy", None)
+    assert repair_text("Caf\u00e9 don\u00e2\u20ac\u2122t") == "Caf\u00e9 don\u2019t"
+
+
+def test_latin1_mojibake_repaired_by_fallback(monkeypatch):
+    """UTF-8 read as Latin-1 leaves C1 controls where cp1252 has symbols."""
+    monkeypatch.setattr(encoding, "_ftfy", None)
+    assert repair_text("Virginia\u00e2\u0080\u0099s") == "Virginia\u2019s"
+
+
+# ---------------------------------------------------------------------------
+# FFFD replacement characters (from strict UTF-8 decode of cp1252 bytes)
+
+
+def test_fffd_preserved_not_crashed(repair):
+    """repair_text must not raise on U+FFFD; it may or may not repair it."""
+    text = "Virginia\ufffds Public Schools"
+    result = repair(text)
+    assert isinstance(result, str)
+    assert "Virginia" in result
+
+
+# ---------------------------------------------------------------------------
+# _KNOWN_REPAIRS table structure
+
+
+def test_known_repairs_non_empty():
+    assert len(_KNOWN_REPAIRS) > 0
+
+
+def test_known_repairs_are_pairs():
+    for item in _KNOWN_REPAIRS:
+        assert len(item) == 2
+        bad, good = item
+        assert isinstance(bad, str) and isinstance(good, str)
+
+
+def test_known_repairs_bad_not_equal_good():
+    for bad, good in _KNOWN_REPAIRS:
+        assert bad != good
+
+
+def test_known_repairs_specific_before_generic():
+    bads = [bad for bad, _ in _KNOWN_REPAIRS]
+    for i, earlier in enumerate(bads):
+        assert all(earlier not in later for later in bads[i + 1:])