t1.1: scrape one forum via ViewComments.cfm POST pagination

Spider fetches ViewComments.cfm?GdocForumID=N with vPerPage=500, generates all page requests from page-1 metadata, and parses each div.Cbox for comment_id, author, date, title, text, reg_title, reg_desc. Handles span-wrapped comment text. Fixes UTF-8/windows-1251 meta-tag encoding mismatch. 9083 items, 15 empty-text (0.17%). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-05 12:28:07 -04:00
parent 02964312cb
commit beb5cf461b
6 changed files with 387 additions and 22 deletions
--- a/scraper/items.py
+++ b/scraper/items.py
@@ -1,12 +1,17 @@
-# Define here the models for your scraped items
-#
-# See documentation in:
-# https://docs.scrapy.org/en/latest/topics/items.html
-
 import scrapy


-class ScraperItem(scrapy.Item):
-    # define the fields for your item here like:
-    # name = scrapy.Field()
-    pass
+class CommentItem(scrapy.Item):
+    # Forum / regulation context
+    forum_id   = scrapy.Field()
+    reg_title  = scrapy.Field()
+    reg_desc   = scrapy.Field()
+
+    # Comment metadata
+    comment_id = scrapy.Field()
+    author     = scrapy.Field()
+    date       = scrapy.Field()
+    title      = scrapy.Field()
+
+    # Comment content
+    text       = scrapy.Field()
--- a/scraper/settings.py
+++ b/scraper/settings.py
@@ -15,8 +15,7 @@ NEWSPIDER_MODULE = "scraper.spiders"
 ADDONS = {}


-# Crawl responsibly by identifying yourself (and your website) on the user-agent
-#USER_AGENT = "scraper (+http://www.yourdomain.com)"
+USER_AGENT = "vath-research-scraper/1.0 (public comment analysis; contact: research)"

 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -75,13 +74,24 @@ DOWNLOAD_DELAY = 1
 # Enable showing throttling stats for every response received:
 #AUTOTHROTTLE_DEBUG = False

-# Enable and configure HTTP caching (disabled by default)
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
-#HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
-#HTTPCACHE_DIR = "httpcache"
-#HTTPCACHE_IGNORE_HTTP_CODES = []
-#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+# HTTP cache — enabled during development to avoid re-hitting the server on test runs.
+# Disable (or delete httpcache/) before a production run.
+HTTPCACHE_ENABLED = True
+HTTPCACHE_EXPIRATION_SECS = 86400  # 24 h
+HTTPCACHE_DIR = "httpcache"
+
+# Output
+FEEDS = {
+    "output/%(name)s_%(time)s.jsonl": {
+        "format": "jsonlines",
+        "encoding": "utf-8",
+        "overwrite": False,
+    }
+}
+
+# The site declares windows-1251 in a meta tag but sends valid UTF-8 bytes.
+# Force UTF-8 to prevent lxml from re-decoding via the meta charset.
+DEFAULT_RESPONSE_ENCODING = "utf-8"

 # Set settings whose default value is deprecated to a future-proof value
 FEED_EXPORT_ENCODING = "utf-8"
--- a/scraper/spiders/forum.py
+++ b/scraper/spiders/forum.py
@@ -0,0 +1,116 @@
+import math
+import re
+from urllib.parse import urlencode
+
+import scrapy
+
+from scraper.items import CommentItem
+
+_BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm"
+_NBSP = "\xa0"
+_REPLACEMENT_CHAR = "<EFBFBD>"
+
+
+def _view_url(forum_id):
+    return f"{_BASE}?GdocForumID={forum_id}"
+
+
+class ForumSpider(scrapy.Spider):
+    name = "forum"
+    allowed_domains = ["townhall.virginia.gov"]
+
+    # Override at runtime: scrapy crawl forum -a forum_id=452
+    forum_id = "452"
+    per_page = 500
+
+    async def start(self):
+        yield scrapy.FormRequest(
+            _view_url(self.forum_id),
+            formdata={"vPage": "1", "vPerPage": str(self.per_page), "sub1": "go"},
+            callback=self.parse_comments,
+            meta={"is_first": True},
+        )
+
+    # ------------------------------------------------------------------
+    def parse_comments(self, response):
+        if response.meta.get("is_first"):
+            reg_title, reg_desc = self._reg_context(response)
+            last_page = self._last_page(response)
+            for page in range(2, last_page + 1):
+                yield scrapy.FormRequest(
+                    _view_url(self.forum_id),
+                    formdata={"vPage": str(page), "vPerPage": str(self.per_page), "sub1": "go"},
+                    callback=self.parse_comments,
+                    meta={"reg_title": reg_title, "reg_desc": reg_desc},
+                )
+        else:
+            reg_title = response.meta["reg_title"]
+            reg_desc = response.meta["reg_desc"]
+
+        for box in response.css("div.Cbox"):
+            yield self._parse_box(box, reg_title, reg_desc)
+
+    # ------------------------------------------------------------------
+    def _parse_box(self, box, reg_title, reg_desc):
+        cbox_id = box.attrib.get("id", "")
+        comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else ""
+
+        date = (
+            box.css("div[style*='float: right'] div::text").get("")
+            .replace(_NBSP, " ").strip()
+        )
+
+        author = (
+            box.xpath('.//strong[contains(text(),"Commenter:")]/following-sibling::text()[1]')
+            .get("").strip()
+        )
+
+        # Second <strong> in the commenter block is the comment title
+        strongs = box.css("div > strong::text").getall()
+        title = strongs[-1].strip() if len(strongs) > 1 else ""
+
+        paragraphs = box.css(".divComment *::text, .divComment::text").getall()
+        text = " ".join(p.strip() for p in paragraphs if p.strip())
+        text = text.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()
+
+        return CommentItem(
+            forum_id=self.forum_id,
+            reg_title=reg_title,
+            reg_desc=reg_desc,
+            comment_id=comment_id,
+            author=author,
+            date=date,
+            title=title,
+            text=text,
+        )
+
+    # ------------------------------------------------------------------
+    def _reg_context(self, response):
+        # Page shows: <strong>Guidance Document Change:</strong> description text...
+        label_node = response.xpath('//strong[contains(text(),"Change:")]')
+        label_text = label_node.css("::text").get("").strip()
+
+        # Collect all sibling text nodes following the label
+        siblings = label_node.xpath("following-sibling::text()").getall()
+        raw = " ".join(t.strip() for t in siblings if t.strip())
+        raw = raw.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()
+
+        # reg_desc is the full description text
+        reg_desc = raw
+
+        # reg_title: text up to the first "was " clause or first 200 chars
+        m = re.match(r"^(.+?)\s+(?:was |has |guidance document)", raw, re.IGNORECASE)
+        reg_title = m.group(1).strip() if m else raw[:200]
+
+        return reg_title, reg_desc
+
+    def _last_page(self, response):
+        hrefs = response.xpath(
+            '//form[@name="page"]//a[contains(@href,"vpage.value=")]/@href'
+        ).getall()
+        pages = [
+            int(m.group(1))
+            for h in hrefs
+            if (m := re.search(r"vpage\.value=(\d+)", h))
+        ]
+        return max(pages) if pages else 1