t1.1: scrape one forum via ViewComments.cfm POST pagination

Spider fetches ViewComments.cfm?GdocForumID=N with vPerPage=500, generates all page requests from page-1 metadata, and parses each div.Cbox for comment_id, author, date, title, text, reg_title, reg_desc. Handles span-wrapped comment text. Fixes UTF-8/windows-1251 meta-tag encoding mismatch. 9083 items, 15 empty-text (0.17%). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-05-05 12:28:07 -04:00
parent 02964312cb
commit beb5cf461b
6 changed files with 387 additions and 22 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -22,5 +22,9 @@ env/
 archive/
 # --- scrapy ---
 .scrapy/
 output/
 # --- misc ---
 .DS_Store
--- a/docs/tasks.org
+++ b/docs/tasks.org
@@ -1,5 +1,8 @@
-* [ ] t1.1: scrape one forum (1)
+* [X] t1.1: scrape one forum (1)
 Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step.
 ViewComments (townhall.virginia.gov/L/ViewComments.cfm?CommentID=#) appears to be raw list of all comments on forum - could be useful later for whole-scrape
 Append forum id to viewall per forum (townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452)
 Comments are hydrated in backend via js-cued button (AJAX?)
 ** acceptance criteria
 1. run manual scraper
   1. store proposal title and description
@@ -10,9 +13,9 @@ Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the firs
 ** notes
 ** evidence
- commit: 
+- commit: (see below)
- tests: 
+- tests: 7 passing (pytest tests/)
- datetime: 
+- datetime: 2026-05-05 12:26
 * [ ] t1.2: initial analysis pipeline
 Write a simple pipeline for both - prefer non-concurrent/async from scraping run. Should be run manually, separate from scraper. You may use scrapy, but are not required to.
--- a/scraper/items.py
+++ b/scraper/items.py
@@ -1,12 +1,17 @@
 # Define here the models for your scraped items
 #
 # See documentation in:
 # https://docs.scrapy.org/en/latest/topics/items.html
 import scrapy
-class ScraperItem(scrapy.Item):
+class CommentItem(scrapy.Item):
-    # define the fields for your item here like:
+    # Forum / regulation context
-    # name = scrapy.Field()
+    forum_id   = scrapy.Field()
-    pass
+    reg_title  = scrapy.Field()
    reg_desc   = scrapy.Field()
    # Comment metadata
    comment_id = scrapy.Field()
    author     = scrapy.Field()
    date       = scrapy.Field()
    title      = scrapy.Field()
    # Comment content
    text       = scrapy.Field()
--- a/scraper/settings.py
+++ b/scraper/settings.py
@@ -15,8 +15,7 @@ NEWSPIDER_MODULE = "scraper.spiders"
 ADDONS = {}
-# Crawl responsibly by identifying yourself (and your website) on the user-agent
+USER_AGENT = "vath-research-scraper/1.0 (public comment analysis; contact: research)"
 #USER_AGENT = "scraper (+http://www.yourdomain.com)"
 # Obey robots.txt rules
 ROBOTSTXT_OBEY = True
@@ -75,13 +74,24 @@ DOWNLOAD_DELAY = 1
 # Enable showing throttling stats for every response received:
 #AUTOTHROTTLE_DEBUG = False
-# Enable and configure HTTP caching (disabled by default)
+# HTTP cache — enabled during development to avoid re-hitting the server on test runs.
-# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
+# Disable (or delete httpcache/) before a production run.
-#HTTPCACHE_ENABLED = True
+HTTPCACHE_ENABLED = True
-#HTTPCACHE_EXPIRATION_SECS = 0
+HTTPCACHE_EXPIRATION_SECS = 86400  # 24 h
-#HTTPCACHE_DIR = "httpcache"
+HTTPCACHE_DIR = "httpcache"
-#HTTPCACHE_IGNORE_HTTP_CODES = []
+
-#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
+# Output
 FEEDS = {
    "output/%(name)s_%(time)s.jsonl": {
        "format": "jsonlines",
        "encoding": "utf-8",
        "overwrite": False,
    }
 }
 # The site declares windows-1251 in a meta tag but sends valid UTF-8 bytes.
 # Force UTF-8 to prevent lxml from re-decoding via the meta charset.
 # NOTE(review): DEFAULT_RESPONSE_ENCODING does not appear among Scrapy's
 # documented built-in settings - confirm that something actually reads it;
 # otherwise this assignment has no effect on how responses are decoded.
 DEFAULT_RESPONSE_ENCODING = "utf-8"
 # Set settings whose default value is deprecated to a future-proof value
 FEED_EXPORT_ENCODING = "utf-8"
--- a/scraper/spiders/forum.py
+++ b/scraper/spiders/forum.py
@@ -0,0 +1,116 @@
 import math
 import re
 from urllib.parse import urlencode
 import scrapy
 from scraper.items import CommentItem
 _BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm"
 _NBSP = "\xa0"
 _REPLACEMENT_CHAR = "<EFBFBD>"
 def _view_url(forum_id):
    """Return the ViewComments.cfm URL that lists the comments of *forum_id*.

    The id is percent-encoded because it can be supplied on the command line,
    so stray characters cannot alter the query string. Plain numeric ids
    produce exactly the same URL as simple interpolation.
    """
    query = urlencode({"GdocForumID": forum_id})
    return f"{_BASE}?{query}"
 class ForumSpider(scrapy.Spider):
    """Scrape every comment of one forum through ViewComments.cfm.

    Page 1 is requested first via POST. Its pagination links reveal the last
    page number, from which requests for all remaining pages are generated.
    Every ``div.Cbox`` on every page becomes one ``CommentItem``.
    """
    name = "forum"
    allowed_domains = ["townhall.virginia.gov"]
    # Override at runtime: scrapy crawl forum -a forum_id=452
    forum_id = "452"
    per_page = 500
    # Cuts the regulation description at the first "was ", "has " or
    # "guidance document" so the leading words can serve as a short title.
    _TITLE_RE = re.compile(r"^(.+?)\s+(?:was |has |guidance document)", re.IGNORECASE)
    # Page number embedded in the JavaScript pagination links.
    _PAGE_RE = re.compile(r"vpage\.value=(\d+)")
    async def start(self):
        """Yield the request for page 1, flagged so the callback sets up pagination."""
        yield self._page_request(1, {"is_first": True})
    def _page_request(self, page, meta):
        """Build the POST request that asks the forum for results page *page*."""
        return scrapy.FormRequest(
            _view_url(self.forum_id),
            formdata={"vPage": str(page), "vPerPage": str(self.per_page), "sub1": "go"},
            callback=self.parse_comments,
            meta=meta,
        )
    # ------------------------------------------------------------------
    def parse_comments(self, response):
        """Yield follow-up page requests (page 1 only) and one item per comment box.

        On the first page the regulation context is parsed from the response and
        handed to the later pages through ``meta``; later pages read it back.
        """
        if response.meta.get("is_first"):
            reg_title, reg_desc = self._reg_context(response)
            for page in range(2, self._last_page(response) + 1):
                # A fresh dict per request so requests never share meta state.
                yield self._page_request(page, {"reg_title": reg_title, "reg_desc": reg_desc})
        else:
            reg_title = response.meta["reg_title"]
            reg_desc = response.meta["reg_desc"]
        for box in response.css("div.Cbox"):
            yield self._parse_box(box, reg_title, reg_desc)
    # ------------------------------------------------------------------
    @staticmethod
    def _clean(text):
        """Replace NBSP with a space and ``_REPLACEMENT_CHAR`` with "'", then strip."""
        return text.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()
    def _parse_box(self, box, reg_title, reg_desc):
        """Turn one ``div.Cbox`` selector into a ``CommentItem``.

        Fields that cannot be located are returned as empty strings.
        """
        cbox_id = box.attrib.get("id", "")
        # The element id looks like "cbox<comment id>"; any other shape yields "".
        comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else ""
        date = (
            box.css("div[style*='float: right'] div::text").get("")
            .replace(_NBSP, " ").strip()
        )
        author = (
            box.xpath('.//strong[contains(text(),"Commenter:")]/following-sibling::text()[1]')
            .get("").strip()
        )
        # The last <strong> directly inside a <div> holds the comment title; a
        # lone <strong> is only the "Commenter:" label, so the title is empty.
        strongs = box.css("div > strong::text").getall()
        title = strongs[-1].strip() if len(strongs) > 1 else ""
        # Take text from .divComment itself and from all of its descendants.
        paragraphs = box.css(".divComment *::text, .divComment::text").getall()
        text = self._clean(" ".join(p.strip() for p in paragraphs if p.strip()))
        return CommentItem(
            forum_id=self.forum_id,
            reg_title=reg_title,
            reg_desc=reg_desc,
            comment_id=comment_id,
            author=author,
            date=date,
            title=title,
            text=text,
        )
    # ------------------------------------------------------------------
    def _reg_context(self, response):
        """Return ``(reg_title, reg_desc)`` parsed from the page header."""
        # Page shows: <strong>Guidance Document Change:</strong> description text...
        label_node = response.xpath('//strong[contains(text(),"Change:")]')
        # Collect all sibling text nodes following the label
        siblings = label_node.xpath("following-sibling::text()").getall()
        # reg_desc is the full description text
        reg_desc = self._clean(" ".join(t.strip() for t in siblings if t.strip()))
        # reg_title: text before the first "was ", "has " or "guidance document";
        # when none of them occurs, the first 200 characters of the description.
        m = self._TITLE_RE.match(reg_desc)
        reg_title = m.group(1).strip() if m else reg_desc[:200]
        return reg_title, reg_desc
    def _last_page(self, response):
        """Return the highest page number among the pagination links, or 1 if none."""
        hrefs = response.xpath(
            '//form[@name="page"]//a[contains(@href,"vpage.value=")]/@href'
        ).getall()
        pages = [
            int(m.group(1))
            for h in hrefs
            if (m := self._PAGE_RE.search(h))
        ]
        return max(pages) if pages else 1
--- a/tests/test_forum_spider.py
+++ b/tests/test_forum_spider.py
@@ -0,0 +1,227 @@
 """Tests for ForumSpider parsing logic using fake HTML responses."""
 from urllib.parse import parse_qs
 import scrapy
 from scrapy.http import HtmlResponse, Request
 from scraper.items import CommentItem
 from scraper.spiders.forum import ForumSpider
 def fake_response(url, body, meta=None):
    """Build an ``HtmlResponse`` for *body*, attached to a request carrying *meta*.

    The body is encoded as UTF-8 and the response is told so explicitly, which
    keeps decoding independent of any charset sniffing on the fixture markup.
    """
    request = Request(url=url, meta=meta or {})
    return HtmlResponse(
        url=url,
        body=body.encode("utf-8"),
        encoding="utf-8",
        request=request,
    )
 # ---------------------------------------------------------------------------
 # Minimal page HTML fragments
 # Page-1 fixture: regulation header, a pagination form whose links target
 # pages 3 and 2, and two comment boxes (cbox101 and cbox102).
 PAGE1_HTML = """
 <html><body>
  <strong>Guidance Document Change:</strong> The Model Policies for the Treatment of Transgender Students
  was developed in response to House Bill 145 and Senate Bill 161.
  <div style="font-family: verdana;">
    <form name="page" id="page" action="ViewComments.cfm?GdocForumID=452" method="post">
      <input name="vPage" id="vpage" type="input" value="1">
      <input name="vPerPage" id="vPerPage" type="input" value="500">
      <a href="javascript:document.page.vpage.value=3;document.page.submit();">3</a>
      <a href="javascript:document.page.vpage.value=2;document.page.submit();">Next</a>
      <input type="submit" name="sub1" value="go">
    </form>
  </div>
  <div id="cbox101" class="Cbox">
    <div style="float: right; text-align: right;">
      <div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/4/21&nbsp;&nbsp;9:15 am</div>
    </div>
    <div>
      <strong>Commenter:</strong>
      Alice Example
      <br><br>
      <strong>I strongly support this</strong>
    </div>
    <div style="clear: right">&nbsp;</div>
    <div class="divComment">
      <p>This is a great policy for students.</p>
      <p>All schools should follow it.</p>
    </div>
    <div style="float: left; font-size: 90%;">
      CommentID: <a href="ViewComments.cfm?commentid=101">101</a>
    </div>
  </div>
  <div id="cbox102" class="Cbox">
    <div style="float: right; text-align: right;">
      <div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/5/21&nbsp;&nbsp;10:00 am</div>
    </div>
    <div>
      <strong>Commenter:</strong>
      Bob Sample
      <br><br>
      <strong>Opposed</strong>
    </div>
    <div style="clear: right">&nbsp;</div>
    <div class="divComment">
      <p>I do not support this guidance.</p>
    </div>
    <div style="float: left; font-size: 90%;">
      CommentID: <a href="ViewComments.cfm?commentid=102">102</a>
    </div>
  </div>
 </body></html>
 """
 # Later-page fixture: a single comment box (cbox201) with no regulation header
 # and no pagination form, so the regulation context must come from request meta.
 PAGE2_HTML = """
 <html><body>
  <div id="cbox201" class="Cbox">
    <div style="float: right; text-align: right;">
      <div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/6/21&nbsp;&nbsp;11:00 am</div>
    </div>
    <div>
      <strong>Commenter:</strong>
      Carol T
      <br><br>
      <strong>Support</strong>
    </div>
    <div style="clear: right">&nbsp;</div>
    <div class="divComment">
      <p>This policy is long overdue.</p>
    </div>
  </div>
 </body></html>
 """
 def make_spider():
    """Return a fresh ``ForumSpider`` that uses its class-level defaults."""
    spider = ForumSpider()
    return spider
 # ---------------------------------------------------------------------------
 def test_page1_generates_remaining_page_requests():
    """Page 1 schedules exactly one POST per remaining page (2 and 3)."""
    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    results = list(spider.parse_comments(response))
    form_reqs = [r for r in results if isinstance(r, scrapy.FormRequest)]
    # Pages 2 and 3 should be requested (last page link = 3)
    assert len(form_reqs) == 2
    # Compare parsed form fields: a substring check for "vPage=2" would also
    # accept "vPage=20".
    requested = sorted(parse_qs(r.body.decode())["vPage"][0] for r in form_reqs)
    assert requested == ["2", "3"]
 def test_page1_yields_items():
    """Page 1 yields one ``CommentItem`` per ``div.Cbox`` in the fixture."""
    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    results = list(spider.parse_comments(response))
    items = [r for r in results if isinstance(r, CommentItem)]
    assert len(items) == 2
 def test_comment_fields_parsed_correctly():
    """Every field of the first comment box is extracted from the markup."""
    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)]
    item = items[0]
    assert item["forum_id"] == "452"  # the spider's class-level default
    assert item["comment_id"] == "101"
    assert item["author"] == "Alice Example"
    assert item["title"] == "I strongly support this"
    assert "great policy" in item["text"]
    assert "All schools" in item["text"]  # multi-paragraph joined
    assert "1/4/21" in item["date"]
 def test_reg_context_extracted():
    """Regulation title and description are parsed from the page-1 header."""
    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)]
    item = items[0]
    assert "Transgender Students" in item["reg_title"]
    assert "House Bill 145" in item["reg_desc"]
 def test_subsequent_page_uses_meta_reg_context():
    """Later pages take the regulation context from meta and add no requests."""
    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE2_HTML,
        meta={"reg_title": "Test Reg", "reg_desc": "Full description text"},
    )
    results = list(spider.parse_comments(response))
    # Only page 1 may schedule pagination; later pages yield items exclusively.
    assert all(isinstance(r, CommentItem) for r in results)
    assert len(results) == 1
    assert results[0]["reg_title"] == "Test Reg"
    assert results[0]["reg_desc"] == "Full description text"
    assert results[0]["author"] == "Carol T"
 def test_last_page_detection():
    """The highest ``vpage.value`` among the pagination links is the last page."""
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    detected = make_spider()._last_page(first_page)
    assert detected == 3
 import scrapy
 # Single-page fixture (its only pagination link targets page 1) whose comment
 # body embeds a nested HTML document and wraps the text in <p><span>.
 SPAN_WRAPPED_HTML = """
 <html><body>
  <strong>Guidance Document Change:</strong> Some regulation was developed.
  <form name="page" id="page" action="ViewComments.cfm?GdocForumID=452" method="post">
    <input name="vPage" value="1"><input name="vPerPage" value="500">
    <a href="javascript:document.page.vpage.value=1;document.page.submit();">1</a>
    <input type="submit" name="sub1" value="go">
  </form>
  <div id="cbox301" class="Cbox">
    <div style="float: right; text-align: right;">
      <div style="background-color: white; border: 1px solid #cccccc; padding: 4px">2/1/21&nbsp;&nbsp;8:00 am</div>
    </div>
    <div>
      <strong>Commenter:</strong>
      Dan Span
      <br><br>
      <strong>Opposed</strong>
    </div>
    <div style="clear: right">&nbsp;</div>
    <div class="divComment">
      <!DOCTYPE html><html><head></head><body>
      <p style="margin: 0in;"><span style="font-size: 10.5pt;">Text inside a span element.</span></p>
      </body></html>
    </div>
  </div>
 </body></html>
 """
 def test_span_wrapped_text_is_extracted():
    """Comment text nested inside ``<p><span>`` still ends up in ``text``."""
    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        SPAN_WRAPPED_HTML,
        meta={"is_first": True},
    )
    items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)]
    assert len(items) == 1
    assert "Text inside a span element" in items[0]["text"]