added texts and fixes for mojibake
This commit is contained in:
74
analysis/encoding.py
Normal file
74
analysis/encoding.py
Normal file
@@ -0,0 +1,74 @@
|
||||
"""
|
||||
analysis/encoding.py — text encoding repair for scraped content.
|
||||
|
||||
The townhall.virginia.gov scraper forces UTF-8 decoding, which is correct for the
|
||||
site's current content. This module provides a defensive repair function for cases
|
||||
where a response arrives with Windows-1252/cp1252 bytes embedded in otherwise UTF-8
|
||||
content (common in older CMSes). The raw scrape files are never modified; repair is
|
||||
applied at the analysis and reporting layers only.
|
||||
|
||||
Primary: uses `ftfy` when installed (pip install ftfy).
|
||||
Fallback: re-encodes as cp1252, decodes as UTF-8 (pure mojibake strings only),
|
||||
then applies a table of known-bad patterns for mixed-encoding strings.
|
||||
"""
|
||||
|
||||
# ---------------------------------------------------------------------------
# Known patterns: UTF-8 bytes decoded as cp1252, i.e. the 3-char sequences you
# see when a server sends e.g. E2 80 99 and each byte is decoded as a cp1252
# character.
#
# Byte → cp1252 char mappings for the bytes involved (mostly 0x80–0x9F):
#   E2 → U+00E2 (a with circumflex, always the first char)
#   80 → U+20AC (euro sign, cp1252 0x80)
#   99 → U+2122 (trade mark sign, cp1252 0x99)      ← E2 80 99 = U+2019 right single quote
#   98 → U+02DC (small tilde, cp1252 0x98)          ← E2 80 98 = U+2018 left single quote
#   9C → U+0153 (oe ligature, cp1252 0x9C)          ← E2 80 9C = U+201C left double quote
#   9D → U+009D (undefined in cp1252, kept as C1)   ← E2 80 9D = U+201D right double quote
#   93 → U+201C (left double quote, cp1252 0x93)    ← E2 80 93 = U+2013 en dash
#   94 → U+201D (right double quote, cp1252 0x94)   ← E2 80 94 = U+2014 em dash
#   A6 → U+00A6 (broken bar, cp1252 0xA6)           ← E2 80 A6 = U+2026 ellipsis
|
||||
|
||||
_KNOWN_REPAIRS: list[tuple[str, str]] = [
|
||||
# Longer / more specific patterns first to avoid partial matches
|
||||
("’", "’"), # ’ → ' right single quote
|
||||
("‘", "‘"), # ‘ → ' left single quote
|
||||
("“", "“"), # “ → " left double quote
|
||||
("â€", "”"), # â€\x9d → " right double quote
|
||||
("–", "–"), # â€" (with left DQ) → – en dash
|
||||
("—", "—"), # â€" (with right DQ) → — em dash
|
||||
("…", "…"), # … → … ellipsis
|
||||
# Generic fallback: bare †prefix not caught above → remove artifact
|
||||
("â€", ""),
|
||||
]
|
||||
|
||||
|
||||
def _encode_sloppy_cp1252(text: str) -> bytes:
    """Encode *text* as cp1252, letting C1 controls through as raw bytes.

    Python's cp1252 codec refuses every character in U+0080-U+009F, yet those
    are exactly what a lenient cp1252 decoder (or a Latin-1 decode) produces
    for bytes such as 0x9D. Mapping them straight back to their byte value
    lets the UTF-8 round trip recover sequences like E2 80 9D.

    Raises:
        UnicodeEncodeError: if *text* holds a character that is neither
            cp1252-encodable nor a C1 control.
    """
    try:
        return text.encode("cp1252")  # fast path: no C1 controls present
    except UnicodeEncodeError:
        pass
    return b"".join(
        bytes([ord(ch)]) if "\x80" <= ch <= "\x9f" else ch.encode("cp1252")
        for ch in text
    )


def repair_text(text: str) -> str:
    """Repair common encoding artifacts in scraped text.

    Strategy, in order:

    1. If ``ftfy`` is importable, delegate to ``ftfy.fix_text`` with
       ``uncurl_quotes=False`` so typographic quotes are restored rather than
       flattened to ASCII, matching what the fallback paths produce.
    2. Otherwise attempt a whole-string round trip: encode as cp1252 (C1
       controls passed through as bytes, which also covers Latin-1 decodes)
       and decode as UTF-8. This only succeeds when the entire string is
       ASCII and/or mojibake.
    3. Otherwise substitute the sequences listed in ``_KNOWN_REPAIRS``.

    U+FFFD replacement characters (from strict UTF-8 decoding of cp1252 bytes)
    cannot be recovered since the original byte is lost; the fallback paths
    leave them as-is.

    Args:
        text: Text to repair. Falsy input is returned unchanged.

    Returns:
        The repaired text, or *text* itself when nothing could be repaired.
    """
    if not text:
        return text

    try:
        import ftfy
    except ImportError:
        ftfy = None

    if ftfy is not None:
        # fix_text uncurls quotes by default, which would turn a repaired
        # U+2019 into an ASCII apostrophe and disagree with the fallbacks.
        return ftfy.fix_text(text, uncurl_quotes=False)

    # Fallback 1: pure mojibake — entire string is UTF-8 bytes read as
    # cp1252/Latin-1. Re-encode to the original bytes and decode as UTF-8.
    try:
        return _encode_sloppy_cp1252(text).decode("utf-8")
    except (UnicodeEncodeError, UnicodeDecodeError):
        pass

    # Fallback 2: mixed strings — substitute known-bad patterns in table order.
    for bad, good in _KNOWN_REPAIRS:
        text = text.replace(bad, good)
    return text
|
||||
119
tests/encoding.py
Normal file
119
tests/encoding.py
Normal file
@@ -0,0 +1,119 @@
|
||||
"""Unit tests for analysis/encoding.py — no external dependencies required."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent / "analysis"))
|
||||
from encoding import repair_text, _KNOWN_REPAIRS
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core contract
|
||||
|
||||
|
||||
def test_empty_string_unchanged():
    """An empty string comes back as an empty string."""
    result = repair_text("")
    assert result == ""
|
||||
|
||||
|
||||
def test_none_like_empty_unchanged():
    """A None input is handed straight back by the falsy-input guard."""
    # The previous body repeated the empty-string assertion verbatim; None is
    # the other falsy value the `if not text` guard in repair_text absorbs.
    assert repair_text(None) is None
|
||||
|
||||
|
||||
def test_clean_ascii_unchanged():
    """Plain ASCII without artifacts passes through untouched."""
    original = "This is a normal sentence with no encoding issues."
    repaired = repair_text(original)
    assert repaired == original
|
||||
|
||||
|
||||
def test_clean_unicode_unchanged():
    """Already-correct Unicode must come back with the same characters."""
    # Escapes keep the literal in precomposed (NFC) form regardless of how the
    # file is saved or displayed.
    text = "Caf\u00e9, na\u00efve, r\u00e9sum\u00e9 \u2014 proper Unicode already."
    # Checking only ASCII fragments such as "Caf" would still pass if the
    # accented letters or the dash were damaged, so compare the whole string.
    assert repair_text(text) == text
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Known mojibake sequences (tasks.org AC4)
|
||||
# These are the 5 patterns explicitly listed in the acceptance criteria.
|
||||
|
||||
|
||||
def test_right_single_quote():
    """UTF-8 bytes E2 80 99 misread as cp1252 are restored to U+2019."""
    # Escaped on purpose: a raw mojibake literal is easily "fixed" by tooling,
    # which leaves input == expected and makes the test vacuous.
    assert repair_text("Virginia\u00e2\u20ac\u2122s") == "Virginia\u2019s"
|
||||
|
||||
|
||||
def test_left_double_quote():
    """UTF-8 bytes E2 80 9C misread as cp1252 are restored to U+201C."""
    # Escaped on purpose so the mojibake input cannot be rewritten by tooling.
    assert repair_text("\u00e2\u20ac\u0153Hello") == "\u201cHello"
|
||||
|
||||
|
||||
def test_en_dash():
    """UTF-8 bytes E2 80 93 misread as cp1252 are restored to U+2013."""
    # Third mojibake character is U+201C (cp1252 byte 0x93). The input is
    # escaped so it is unambiguous, which allows an exact expectation instead
    # of accepting any dash-like character.
    assert repair_text("pages 1\u00e2\u20ac\u201c5") == "pages 1\u20135"
|
||||
|
||||
|
||||
def test_em_dash():
    """UTF-8 bytes E2 80 94 misread as cp1252 are restored to U+2014."""
    # Third mojibake character is U+201D (cp1252 byte 0x94). The input is
    # escaped so it is unambiguous, which allows an exact expectation instead
    # of accepting any dash-like character.
    assert repair_text("word\u00e2\u20ac\u201dword") == "word\u2014word"
|
||||
|
||||
|
||||
def test_right_double_quote():
    """Mojibake for U+201D (bytes E2 80 9D) leaves no artifact behind."""
    # Byte 0x9D is undefined in cp1252, so the third character arrives as the
    # C1 control U+009D. It is written as an escape because it is invisible
    # and is easily dropped from a raw literal.
    result = repair_text("said\u00e2\u20ac\u009d he")
    # Should not contain the raw two-character artifact prefix.
    assert "\u00e2\u20ac" not in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Round-trip: garbled text produces sensible output
|
||||
|
||||
|
||||
def test_garbled_sentence_repaired():
    """A sentence containing mojibake is repaired to readable text."""
    # "Don't" with a right single quote (U+2019) encoded as UTF-8 and then
    # decoded as cp1252: the quote becomes U+00E2 U+20AC U+2122. Escaped so
    # the literal really is garbled regardless of tooling.
    garbled = "Don\u00e2\u20ac\u2122t worry"
    result = repair_text(garbled)
    assert "Don" in result and "t worry" in result
    assert "\u00e2\u20ac" not in result  # artifact gone
|
||||
|
||||
|
||||
def test_clean_string_after_repair_has_no_artifacts():
    """Repairing a sentence with both double-quote mojibakes removes the prefix."""
    # Opening quote: E2 80 9C -> U+00E2 U+20AC U+0153.
    # Closing quote: E2 80 9D -> U+00E2 U+20AC U+009D (C1 control).
    garbled = "She said \u00e2\u20ac\u0153Hello\u00e2\u20ac\u009d and left."
    result = repair_text(garbled)
    assert "\u00e2\u20ac" not in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# FFFD replacement characters (from strict UTF-8 decode of cp1252 bytes)
|
||||
|
||||
|
||||
def test_fffd_preserved_not_crashed():
    """repair_text must not raise on U+FFFD; it may or may not repair it."""
    # Use the real replacement character via an escape; a placeholder such as
    # the literal text "<EFBFBD>" never exercises the U+FFFD code path.
    text = "Virginia\ufffds Public Schools"
    result = repair_text(text)
    assert isinstance(result, str)
    assert "Virginia" in result
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _KNOWN_REPAIRS table structure
|
||||
|
||||
|
||||
def test_known_repairs_non_empty():
    """The repair table holds at least one entry."""
    entry_count = len(_KNOWN_REPAIRS)
    assert entry_count > 0
|
||||
|
||||
|
||||
def test_known_repairs_are_pairs():
    """Every table entry is a two-item sequence of strings."""
    for entry in _KNOWN_REPAIRS:
        assert len(entry) == 2
        pattern, replacement = entry
        assert isinstance(pattern, str) and isinstance(replacement, str)
|
||||
|
||||
|
||||
def test_known_repairs_bad_not_equal_good():
    """No table entry maps a pattern onto itself (that would be a no-op)."""
    no_op_entries = [(bad, good) for bad, good in _KNOWN_REPAIRS if bad == good]
    assert not no_op_entries
|
||||
Reference in New Issue
Block a user