t1.1: scrape one forum via ViewComments.cfm POST pagination
Spider fetches ViewComments.cfm?GdocForumID=N with vPerPage=500, generates all page requests from page-1 metadata, and parses each div.Cbox for comment_id, author, date, title, text, reg_title, reg_desc. Handles span-wrapped comment text. Fixes UTF-8/windows-1251 meta-tag encoding mismatch. 9083 items, 15 empty-text (0.17%). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
4
.gitignore
vendored
4
.gitignore
vendored
@@ -22,5 +22,9 @@ env/
|
||||
archive/
|
||||
|
||||
|
||||
# --- scrapy ---
|
||||
.scrapy/
|
||||
output/
|
||||
|
||||
# --- misc ---
|
||||
.DS_Store
|
||||
@@ -1,5 +1,8 @@
|
||||
* [ ] t1.1: scrape one forum (1)
|
||||
* [X] t1.1: scrape one forum (1)
|
||||
Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step.
|
||||
ViewComments (townhall.virginia.gov/L/ViewComments.cfm?CommentID=#) appears to be a raw list of all comments on a forum - could be useful later for a full scrape
|
||||
Append forum id to viewall per forum (townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452)
|
||||
Comments appear to be loaded from the backend when a JS-triggered button is clicked (AJAX?)
|
||||
** acceptance criteria
|
||||
1. run manual scraper
|
||||
1. store proposal title and description
|
||||
@@ -10,9 +13,9 @@ Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the firs
|
||||
** notes
|
||||
|
||||
** evidence
|
||||
- commit:
|
||||
- tests:
|
||||
- datetime:
|
||||
- commit: (see below)
|
||||
- tests: 7 passing (pytest tests/)
|
||||
- datetime: 2026-05-05 12:26
|
||||
|
||||
* [ ] t1.2: initial analysis pipeline
|
||||
Write a simple pipeline for both - prefer non-concurrent/async from scraping run. Should be run manually, separate from scraper. You may use scrapy, but are not required to.
|
||||
|
||||
@@ -1,12 +1,17 @@
|
||||
# Define here the models for your scraped items
|
||||
#
|
||||
# See documentation in:
|
||||
# https://docs.scrapy.org/en/latest/topics/items.html
|
||||
|
||||
import scrapy
|
||||
|
||||
|
||||
class ScraperItem(scrapy.Item):
    """Placeholder item from the Scrapy project template; declares no fields."""
|
||||
class CommentItem(scrapy.Item):
    """One public comment together with the forum/regulation it belongs to."""

    # --- forum / regulation context (same for every comment of a forum) ---
    forum_id = scrapy.Field()
    reg_title = scrapy.Field()
    reg_desc = scrapy.Field()

    # --- per-comment metadata ---
    comment_id = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()

    # --- comment body ---
    text = scrapy.Field()
|
||||
|
||||
@@ -15,8 +15,7 @@ NEWSPIDER_MODULE = "scraper.spiders"
|
||||
ADDONS = {}
|
||||
|
||||
|
||||
# Crawl responsibly by identifying yourself (and your website) on the user-agent
|
||||
#USER_AGENT = "scraper (+http://www.yourdomain.com)"
|
||||
USER_AGENT = "vath-research-scraper/1.0 (public comment analysis; contact: research)"
|
||||
|
||||
# Obey robots.txt rules
|
||||
ROBOTSTXT_OBEY = True
|
||||
@@ -75,13 +74,24 @@ DOWNLOAD_DELAY = 1
|
||||
# Enable showing throttling stats for every response received:
|
||||
#AUTOTHROTTLE_DEBUG = False
|
||||
|
||||
# Enable and configure HTTP caching (disabled by default)
|
||||
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
|
||||
#HTTPCACHE_ENABLED = True
|
||||
#HTTPCACHE_EXPIRATION_SECS = 0
|
||||
#HTTPCACHE_DIR = "httpcache"
|
||||
#HTTPCACHE_IGNORE_HTTP_CODES = []
|
||||
#HTTPCACHE_STORAGE = "scrapy.extensions.httpcache.FilesystemCacheStorage"
|
||||
# HTTP cache — enabled during development to avoid re-hitting the server on test runs.
|
||||
# Disable (or delete httpcache/) before a production run.
|
||||
HTTPCACHE_ENABLED = True
|
||||
HTTPCACHE_EXPIRATION_SECS = 86400 # 24 h
|
||||
HTTPCACHE_DIR = "httpcache"
|
||||
|
||||
# Output
|
||||
FEEDS = {
|
||||
"output/%(name)s_%(time)s.jsonl": {
|
||||
"format": "jsonlines",
|
||||
"encoding": "utf-8",
|
||||
"overwrite": False,
|
||||
}
|
||||
}
|
||||
|
||||
# The site declares windows-1251 in a meta tag but sends valid UTF-8 bytes.
|
||||
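# Intended to force UTF-8 so the body is not re-decoded via the meta charset.
# NOTE(review): DEFAULT_RESPONSE_ENCODING does not appear to be a setting that
# current Scrapy releases read, so this line may have no effect - confirm, and
# if so force the encoding in the spider (e.g. response.replace(encoding="utf-8")).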
|
||||
DEFAULT_RESPONSE_ENCODING = "utf-8"
|
||||
|
||||
# Set settings whose default value is deprecated to a future-proof value
|
||||
FEED_EXPORT_ENCODING = "utf-8"
|
||||
|
||||
116
scraper/spiders/forum.py
Normal file
116
scraper/spiders/forum.py
Normal file
@@ -0,0 +1,116 @@
|
||||
import math
|
||||
import re
|
||||
from urllib.parse import urlencode
|
||||
|
||||
import scrapy
|
||||
|
||||
from scraper.items import CommentItem
|
||||
|
||||
# Endpoint that lists the comments of a forum; the forum id goes in the query string.
_BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm"
# Non-breaking space, replaced by a plain space in extracted text.
_NBSP = "\xa0"
# U+FFFD REPLACEMENT CHARACTER, written as an escape so the literal survives
# copy/paste and diff rendering (the raw character had degraded to the text
# "<EFBFBD>", which can never match decoded page content).
_REPLACEMENT_CHAR = "\ufffd"
|
||||
|
||||
|
||||
def _view_url(forum_id):
    """Return the ViewComments.cfm URL that lists the comments of *forum_id*.

    The id is passed through ``urlencode`` so a value supplied at runtime
    (``scrapy crawl forum -a forum_id=...``) cannot inject extra query
    parameters or produce a malformed URL. Plain numeric ids yield exactly
    the same URL as simple string interpolation.
    """
    return f"{_BASE}?{urlencode({'GdocForumID': forum_id})}"
|
||||
|
||||
|
||||
class ForumSpider(scrapy.Spider):
    """Scrape every comment of one forum through ViewComments.cfm.

    The spider POSTs to the forum's ViewComments URL asking for ``per_page``
    comments per page. The first response supplies the regulation context
    and the pagination links; one request per remaining page is then
    scheduled, and every ``div.Cbox`` on every page becomes a ``CommentItem``.
    """

    name = "forum"
    allowed_domains = ["townhall.virginia.gov"]

    # Override at runtime: scrapy crawl forum -a forum_id=452
    forum_id = "452"
    per_page = 500

    async def start(self):
        """Yield the request for page 1.

        The ``is_first`` meta flag tells ``parse_comments`` to read the
        regulation context and the pagination links from that response.
        """
        yield self._page_request(1, {"is_first": True})

    def _page_request(self, page, meta):
        """Build the POST request for results page *page* of this forum.

        Args:
            page: 1-based page number, sent as the ``vPage`` form field.
            meta: Request meta dict handed to ``parse_comments``.

        Returns:
            A ``scrapy.FormRequest`` whose callback is ``parse_comments``.
        """
        return scrapy.FormRequest(
            _view_url(self.forum_id),
            formdata={"vPage": str(page), "vPerPage": str(self.per_page), "sub1": "go"},
            callback=self.parse_comments,
            meta=meta,
        )

    # ------------------------------------------------------------------
    def parse_comments(self, response):
        """Yield follow-up page requests (first page only) and comment items.

        On the first page the regulation title/description are extracted and
        forwarded to pages 2..last through request meta; later pages read
        them back from ``response.meta``.
        """
        if response.meta.get("is_first"):
            reg_title, reg_desc = self._reg_context(response)
            last_page = self._last_page(response)
            for page in range(2, last_page + 1):
                yield self._page_request(
                    page, {"reg_title": reg_title, "reg_desc": reg_desc}
                )
        else:
            reg_title = response.meta["reg_title"]
            reg_desc = response.meta["reg_desc"]

        for box in response.css("div.Cbox"):
            yield self._parse_box(box, reg_title, reg_desc)

    # ------------------------------------------------------------------
    def _parse_box(self, box, reg_title, reg_desc):
        """Turn one ``div.Cbox`` selector into a ``CommentItem``.

        Missing pieces yield empty strings rather than raising.
        """
        # The box id looks like "cbox<comment id>"; anything else gives "".
        cbox_id = box.attrib.get("id", "")
        comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else ""

        date = (
            box.css("div[style*='float: right'] div::text").get("")
            .replace(_NBSP, " ").strip()
        )

        # The author is the text node right after the "Commenter:" label.
        author = (
            box.xpath('.//strong[contains(text(),"Commenter:")]/following-sibling::text()[1]')
            .get("").strip()
        )

        # Second <strong> in the commenter block is the comment title
        strongs = box.css("div > strong::text").getall()
        title = strongs[-1].strip() if len(strongs) > 1 else ""

        # Take text from .divComment itself and from any descendant, so
        # span-wrapped comment text is captured too.
        paragraphs = box.css(".divComment *::text, .divComment::text").getall()
        text = " ".join(p.strip() for p in paragraphs if p.strip())
        text = text.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()

        return CommentItem(
            forum_id=self.forum_id,
            reg_title=reg_title,
            reg_desc=reg_desc,
            comment_id=comment_id,
            author=author,
            date=date,
            title=title,
            text=text,
        )

    # ------------------------------------------------------------------
    def _reg_context(self, response):
        """Return ``(reg_title, reg_desc)`` for the forum shown in *response*.

        ``reg_desc`` is the text following the ``<strong>...Change:</strong>``
        label with whitespace runs collapsed to single spaces. ``reg_title``
        is the part of that text before the first " was ", " has " or
        " guidance document" (case-insensitive), or the first 200 characters
        when no such clause is found.
        """
        # Page shows: <strong>Guidance Document Change:</strong> description text...
        label_node = response.xpath('//strong[contains(text(),"Change:")]')

        # Collect all sibling text nodes following the label
        siblings = label_node.xpath("following-sibling::text()").getall()
        raw = " ".join(siblings).replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'")

        # Collapse whitespace, including line breaks inside a text node: "."
        # in the title pattern below does not match newlines, so a wrapped
        # description would otherwise never match and fall back to 200 chars.
        reg_desc = " ".join(raw.split())

        # reg_title: text up to the first "was " clause or first 200 chars
        m = re.match(r"^(.+?)\s+(?:was |has |guidance document)", reg_desc, re.IGNORECASE)
        reg_title = m.group(1).strip() if m else reg_desc[:200]

        return reg_title, reg_desc

    def _last_page(self, response):
        """Return the highest page number linked from the pagination form.

        Page numbers are read from ``vpage.value=<n>`` inside the hrefs of
        links in ``form[name="page"]``; returns 1 when none are present.
        """
        hrefs = response.xpath(
            '//form[@name="page"]//a[contains(@href,"vpage.value=")]/@href'
        ).getall()
        pages = [
            int(m.group(1))
            for h in hrefs
            if (m := re.search(r"vpage\.value=(\d+)", h))
        ]
        return max(pages) if pages else 1
|
||||
227
tests/test_forum_spider.py
Normal file
227
tests/test_forum_spider.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Tests for ForumSpider parsing logic using fake HTML responses."""
|
||||
|
||||
from scrapy.http import HtmlResponse, Request
|
||||
|
||||
from scraper.spiders.forum import ForumSpider
|
||||
|
||||
|
||||
def fake_response(url, body, meta=None):
    """Build an HtmlResponse for *url* whose originating request carries *meta*."""
    request = Request(url=url, meta=meta or {})
    encoded_body = body.encode("utf-8")
    return HtmlResponse(url=url, body=encoded_body, request=request)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Minimal page HTML fragments
|
||||
|
||||
PAGE1_HTML = """
|
||||
<html><body>
|
||||
<strong>Guidance Document Change:</strong> The Model Policies for the Treatment of Transgender Students
|
||||
was developed in response to House Bill 145 and Senate Bill 161.
|
||||
|
||||
<div style="font-family: verdana;">
|
||||
<form name="page" id="page" action="ViewComments.cfm?GdocForumID=452" method="post">
|
||||
<input name="vPage" id="vpage" type="input" value="1">
|
||||
<input name="vPerPage" id="vPerPage" type="input" value="500">
|
||||
<a href="javascript:document.page.vpage.value=3;document.page.submit();">3</a>
|
||||
<a href="javascript:document.page.vpage.value=2;document.page.submit();">Next</a>
|
||||
<input type="submit" name="sub1" value="go">
|
||||
</form>
|
||||
</div>
|
||||
|
||||
<div id="cbox101" class="Cbox">
|
||||
<div style="float: right; text-align: right;">
|
||||
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/4/21 9:15 am</div>
|
||||
</div>
|
||||
<div>
|
||||
<strong>Commenter:</strong>
|
||||
Alice Example
|
||||
<br><br>
|
||||
<strong>I strongly support this</strong>
|
||||
</div>
|
||||
<div style="clear: right"> </div>
|
||||
<div class="divComment">
|
||||
<p>This is a great policy for students.</p>
|
||||
<p>All schools should follow it.</p>
|
||||
</div>
|
||||
<div style="float: left; font-size: 90%;">
|
||||
CommentID: <a href="ViewComments.cfm?commentid=101">101</a>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div id="cbox102" class="Cbox">
|
||||
<div style="float: right; text-align: right;">
|
||||
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/5/21 10:00 am</div>
|
||||
</div>
|
||||
<div>
|
||||
<strong>Commenter:</strong>
|
||||
Bob Sample
|
||||
<br><br>
|
||||
<strong>Opposed</strong>
|
||||
</div>
|
||||
<div style="clear: right"> </div>
|
||||
<div class="divComment">
|
||||
<p>I do not support this guidance.</p>
|
||||
</div>
|
||||
<div style="float: left; font-size: 90%;">
|
||||
CommentID: <a href="ViewComments.cfm?commentid=102">102</a>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
PAGE2_HTML = """
|
||||
<html><body>
|
||||
<div id="cbox201" class="Cbox">
|
||||
<div style="float: right; text-align: right;">
|
||||
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/6/21 11:00 am</div>
|
||||
</div>
|
||||
<div>
|
||||
<strong>Commenter:</strong>
|
||||
Carol T
|
||||
<br><br>
|
||||
<strong>Support</strong>
|
||||
</div>
|
||||
<div style="clear: right"> </div>
|
||||
<div class="divComment">
|
||||
<p>This policy is long overdue.</p>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
|
||||
def make_spider():
    """Return a fresh ForumSpider using its class-level defaults."""
    spider = ForumSpider()
    return spider
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_page1_generates_remaining_page_requests():
    """Page 1 schedules one FormRequest for every later page."""
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    outputs = list(make_spider().parse_comments(first_page))
    follow_ups = [out for out in outputs if isinstance(out, scrapy.FormRequest)]

    # Pages 2 and 3 should be requested (last page link = 3)
    assert len(follow_ups) == 2
    bodies = sorted(req.body.decode() for req in follow_ups)
    assert "vPage=2" in bodies[0]
    assert "vPage=3" in bodies[1]
|
||||
|
||||
|
||||
def test_page1_yields_items():
    """Page 1 yields one CommentItem per div.Cbox in the fixture."""
    from scraper.items import CommentItem

    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    outputs = list(make_spider().parse_comments(first_page))
    comment_items = [out for out in outputs if isinstance(out, CommentItem)]
    assert len(comment_items) == 2
|
||||
|
||||
|
||||
def test_comment_fields_parsed_correctly():
    """The first comment on page 1 has every field extracted as expected."""
    from scraper.items import CommentItem

    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    comment_items = [
        out
        for out in make_spider().parse_comments(first_page)
        if isinstance(out, CommentItem)
    ]
    first = comment_items[0]

    assert first["comment_id"] == "101"
    assert first["author"] == "Alice Example"
    assert first["title"] == "I strongly support this"
    assert "great policy" in first["text"]
    assert "All schools" in first["text"]  # multi-paragraph joined
    assert "1/4/21" in first["date"]
|
||||
|
||||
|
||||
def test_reg_context_extracted():
    """Items from page 1 carry the regulation title and description."""
    from scraper.items import CommentItem

    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    comment_items = [
        out
        for out in make_spider().parse_comments(first_page)
        if isinstance(out, CommentItem)
    ]
    first = comment_items[0]

    assert "Transgender Students" in first["reg_title"]
    assert "House Bill 145" in first["reg_desc"]
|
||||
|
||||
|
||||
def test_subsequent_page_uses_meta_reg_context():
    """Later pages take the regulation context from request meta."""
    from scraper.items import CommentItem

    later_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE2_HTML,
        meta={"reg_title": "Test Reg", "reg_desc": "Full description text"},
    )
    comment_items = [
        out
        for out in make_spider().parse_comments(later_page)
        if isinstance(out, CommentItem)
    ]

    assert len(comment_items) == 1
    only = comment_items[0]
    assert only["reg_title"] == "Test Reg"
    assert only["author"] == "Carol T"
|
||||
|
||||
|
||||
def test_last_page_detection():
    """_last_page returns the largest page number found in the pagination links."""
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    detected = make_spider()._last_page(first_page)
    assert detected == 3
|
||||
|
||||
|
||||
import scrapy
|
||||
|
||||
SPAN_WRAPPED_HTML = """
|
||||
<html><body>
|
||||
<strong>Guidance Document Change:</strong> Some regulation was developed.
|
||||
|
||||
<form name="page" id="page" action="ViewComments.cfm?GdocForumID=452" method="post">
|
||||
<input name="vPage" value="1"><input name="vPerPage" value="500">
|
||||
<a href="javascript:document.page.vpage.value=1;document.page.submit();">1</a>
|
||||
<input type="submit" name="sub1" value="go">
|
||||
</form>
|
||||
|
||||
<div id="cbox301" class="Cbox">
|
||||
<div style="float: right; text-align: right;">
|
||||
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">2/1/21 8:00 am</div>
|
||||
</div>
|
||||
<div>
|
||||
<strong>Commenter:</strong>
|
||||
Dan Span
|
||||
<br><br>
|
||||
<strong>Opposed</strong>
|
||||
</div>
|
||||
<div style="clear: right"> </div>
|
||||
<div class="divComment">
|
||||
<!DOCTYPE html><html><head></head><body>
|
||||
<p style="margin: 0in;"><span style="font-size: 10.5pt;">Text inside a span element.</span></p>
|
||||
</body></html>
|
||||
</div>
|
||||
</div>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
|
||||
def test_span_wrapped_text_is_extracted():
    """Comment text nested inside <p><span> is still captured."""
    from scraper.items import CommentItem

    page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        SPAN_WRAPPED_HTML,
        meta={"is_first": True},
    )
    comment_items = [
        out
        for out in make_spider().parse_comments(page)
        if isinstance(out, CommentItem)
    ]

    assert len(comment_items) == 1
    assert "Text inside a span element" in comment_items[0]["text"]
|
||||
Reference in New Issue
Block a user