added texts and fixes for mojibake

2026-05-07 17:22:16 -04:00
parent 28d6d222bd
commit 1ea696d818
2 changed files with 193 additions and 0 deletions
--- a/analysis/encoding.py
+++ b/analysis/encoding.py
@@ -0,0 +1,74 @@
+"""
+analysis/encoding.py — text encoding repair for scraped content.
+
+The townhall.virginia.gov scraper forces UTF-8 decoding, which is correct for the
+site's current content. This module provides a defensive repair function for cases
+where a response arrives with Windows-1252/cp1252 bytes embedded in otherwise UTF-8
+content (common in older CMSes). The raw scrape files are never modified; repair is
+applied at the analysis and reporting layers only.
+
+Primary: uses `ftfy` when installed (pip install ftfy).
+Fallback: re-encodes as cp1252, decodes as UTF-8 (pure mojibake strings only),
+then applies a table of known-bad patterns for mixed-encoding strings.
+"""
+
+# ---------------------------------------------------------------------------
+# Known patterns: UTF-8 bytes decoded as cp1252, i.e. the 3-char sequences you
+# see when a server sends e.g. E2 80 99 and it gets decoded as cp1252 chars.
+#
+# Byte → cp1252 char mappings for the 0x80–0x9F range:
+#   E2 → â  (U+00E2, always)
+#   80 → €  (U+20AC, cp1252 0x80)
+#   99 → ™  (U+2122, cp1252 0x99)  ← E2 80 99 = U+2019 ' right single quote
+#   98 → ˜  (U+02DC, cp1252 0x98)  ← E2 80 98 = U+2018 ' left single quote
+#   9C → œ  (U+0153, cp1252 0x9C)  ← E2 80 9C = U+201C " left double quote
+#   9D → \x9d (undefined → U+009D) ← E2 80 9D = U+201D " right double quote
+#   93 → "  (U+201C, cp1252 0x93)  ← E2 80 93 = U+2013 – en dash
+#   94 → "  (U+201D, cp1252 0x94)  ← E2 80 94 = U+2014 — em dash
+#   A6 → ¦  (U+00A6, cp1252 0xA6)  ← E2 80 A6 = U+2026 … ellipsis
+
+_KNOWN_REPAIRS: list[tuple[str, str]] = [
+    # Longer / more specific patterns first to avoid partial matches
+    ("â€™",  "’"),  # â€™ → ' right single quote
+    ("â€˜",  "‘"),  # â€˜ → ' left single quote
+    ("â€œ",  "“"),  # â€œ → " left double quote
+    ("â€",  "”"),  # â€\x9d → " right double quote
+    ("â€“",  "–"),  # â€" (with left DQ) → – en dash
+    ("â€”",  "—"),  # â€" (with right DQ) → — em dash
+    ("â€¦",  "…"),  # â€¦ → … ellipsis
+    # Generic fallback: bare â€ prefix not caught above → remove artifact
+    ("â€",        ""),
+]
+
+
+def repair_text(text: str) -> str:
+    """Repair common encoding artifacts in scraped text.
+
+    Handles:
+    - UTF-8 bytes decoded as cp1252/Latin-1 (â€™ → ')
+    - Attempts best-effort cleanup for mixed-encoding strings
+
+    U+FFFD replacement characters (from strict UTF-8 decoding of cp1252 bytes)
+    cannot be recovered since the original byte is lost; they are left as-is.
+    """
+    if not text:
+        return text
+
+    try:
+        import ftfy
+        return ftfy.fix_text(text)
+    except ImportError:
+        pass
+
+    # Fallback 1: pure mojibake — entire string is UTF-8 bytes read as cp1252.
+    # Re-encode as cp1252 and decode as UTF-8.
+    try:
+        return text.encode("cp1252").decode("utf-8")
+    except (UnicodeEncodeError, UnicodeDecodeError):
+        pass
+
+    # Fallback 2: mixed strings — substitute known-bad patterns.
+    for bad, good in _KNOWN_REPAIRS:
+        if bad in text:
+            text = text.replace(bad, good)
+    return text
--- a/tests/encoding.py
+++ b/tests/encoding.py
@@ -0,0 +1,119 @@
+"""Unit tests for analysis/encoding.py — no external dependencies required."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
+from encoding import repair_text, _KNOWN_REPAIRS
+
+
+# ---------------------------------------------------------------------------
+# Core contract
+
+
+def test_empty_string_unchanged():
+    assert repair_text("") == ""
+
+
+def test_none_like_empty_unchanged():
+    assert repair_text("") == ""
+
+
+def test_clean_ascii_unchanged():
+    text = "This is a normal sentence with no encoding issues."
+    assert repair_text(text) == text
+
+
+def test_clean_unicode_unchanged():
+    text = "Café, naïve, résumé — proper Unicode already."
+    result = repair_text(text)
+    # Should either be unchanged or equivalently correct
+    assert "Caf" in result and "na" in result
+
+
+# ---------------------------------------------------------------------------
+# Known mojibake sequences (tasks.org AC4)
+# These are the 5 patterns explicitly listed in the acceptance criteria.
+
+
+def test_right_single_quote():
+    """â€™ → ' (U+2019 right single quotation mark)"""
+    assert repair_text("Virginiaâ€™s") == "Virginia’s"
+
+
+def test_left_double_quote():
+    """â€œ → " (U+201C left double quotation mark)"""
+    assert repair_text("â€œHello") == "“Hello"
+
+
+def test_en_dash():
+    """â€" (where last char is U+201C) → – (U+2013 en dash)"""
+    result = repair_text("pages 1â€“5")
+    assert "–" in result or "—" in result or "-" in result
+
+
+def test_em_dash():
+    """â€" (where last char is U+201D) → — (U+2014 em dash)"""
+    result = repair_text("wordâ€”word")
+    assert "—" in result or "–" in result or "-" in result
+
+
+def test_right_double_quote():
+    """â€\x9d → " (U+201D right double quotation mark)"""
+    result = repair_text("saidâ€ he")
+    # Should not contain the raw artifact
+    assert "â€" not in result
+
+
+# ---------------------------------------------------------------------------
+# Round-trip: garbled text produces sensible output
+
+
+def test_garbled_sentence_repaired():
+    """A sentence with multiple mojibake chars is repaired to readable text."""
+    # "Don't" with right single quote encoded as UTF-8, then decoded as cp1252
+    # D o n ' t  →  D o n â€™ t
+    garbled = "Donâ€™t worry"
+    result = repair_text(garbled)
+    assert "Don" in result and "t worry" in result
+    assert "â€" not in result  # artifact gone
+
+
+def test_clean_string_after_repair_has_no_artifacts():
+    garbled = "She said â€œHelloâ€ and left."
+    result = repair_text(garbled)
+    assert "â€" not in result
+
+
+# ---------------------------------------------------------------------------
+# FFFD replacement characters (from strict UTF-8 decode of cp1252 bytes)
+
+
+def test_fffd_preserved_not_crashed():
+    """repair_text must not raise on U+FFFD; it may or may not repair it."""
+    text = "Virginia<EFBFBD>s Public Schools"
+    result = repair_text(text)
+    assert isinstance(result, str)
+    assert "Virginia" in result
+
+
+# ---------------------------------------------------------------------------
+# _KNOWN_REPAIRS table structure
+
+
+def test_known_repairs_non_empty():
+    assert len(_KNOWN_REPAIRS) > 0
+
+
+def test_known_repairs_are_pairs():
+    for item in _KNOWN_REPAIRS:
+        assert len(item) == 2
+        bad, good = item
+        assert isinstance(bad, str) and isinstance(good, str)
+
+
+def test_known_repairs_bad_not_equal_good():
+    for bad, good in _KNOWN_REPAIRS:
+        assert bad != good