Spider fetches ViewComments.cfm?GdocForumID=N with vPerPage=500, generates all page requests from page-1 metadata, and parses each div.Cbox for comment_id, author, date, title, text, reg_title, reg_desc. Handles span-wrapped comment text. Fixes UTF-8/windows-1251 meta-tag encoding mismatch. 9083 items, 15 empty-text (0.17%). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
117 lines
4.1 KiB
Python
117 lines
4.1 KiB
Python
import math
|
||
import re
|
||
from urllib.parse import urlencode
|
||
|
||
import scrapy
|
||
|
||
from scraper.items import CommentItem
|
||
|
||
# URL of the ViewComments.cfm page that the spider requests for every page.
_BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm"
# Non-breaking space; swapped for a plain space when scraped text is cleaned.
_NBSP = "\xa0"
# U+FFFD REPLACEMENT CHARACTER (UTF-8 bytes EF BF BD). Written as an escape so
# the value stays a single code point even in tools that cannot display it;
# scraped text has this character swapped for an apostrophe.
_REPLACEMENT_CHAR = "\ufffd"
|
||
|
||
|
||
def _view_url(forum_id):
    """Return the ViewComments URL for *forum_id*.

    The query string is built with ``urlencode`` so that a forum id supplied
    on the command line is percent-encoded rather than pasted in raw. Plain
    numeric ids produce ``<_BASE>?GdocForumID=<id>`` unchanged.
    """
    return f"{_BASE}?{urlencode({'GdocForumID': forum_id})}"
|
||
|
||
|
||
class ForumSpider(scrapy.Spider):
    """Scrape every comment of one townhall.virginia.gov comment forum.

    Page 1 is requested with ``vPerPage`` set to ``per_page``. Its pagination
    links decide how many further pages are requested, and every ``div.Cbox``
    on every page is turned into one ``CommentItem``.
    """

    name = "forum"
    allowed_domains = ["townhall.virginia.gov"]

    # Override at runtime: scrapy crawl forum -a forum_id=452
    forum_id = "452"
    per_page = 500

    async def start(self):
        """Yield the request for page 1, flagged so its callback plans the rest."""
        yield self._page_request(1, {"is_first": True})

    # ------------------------------------------------------------------
    def parse_comments(self, response):
        """Yield follow-up page requests (first page only) and comment items.

        On the response flagged ``is_first`` the regulation context is parsed
        from the page and one request per remaining page is yielded, each
        carrying that context in ``meta``. Every other response reads the
        context back from ``meta``. In both cases each ``div.Cbox`` on the
        page is yielded as a ``CommentItem``.
        """
        if response.meta.get("is_first"):
            reg_title, reg_desc = self._reg_context(response)
            last_page = self._last_page(response)
            for page in range(2, last_page + 1):
                yield self._page_request(
                    page, {"reg_title": reg_title, "reg_desc": reg_desc}
                )
        else:
            reg_title = response.meta["reg_title"]
            reg_desc = response.meta["reg_desc"]

        for box in response.css("div.Cbox"):
            yield self._parse_box(box, reg_title, reg_desc)

    def _page_request(self, page, meta):
        """Build the form request that asks the forum for comment page *page*."""
        return scrapy.FormRequest(
            _view_url(self.forum_id),
            formdata={
                "vPage": str(page),
                "vPerPage": str(self.per_page),
                "sub1": "go",
            },
            callback=self.parse_comments,
            meta=meta,
        )

    @staticmethod
    def _clean_text(text):
        """Swap NBSP for a space and U+FFFD for an apostrophe, then strip."""
        return text.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()

    # ------------------------------------------------------------------
    def _parse_box(self, box, reg_title, reg_desc):
        """Turn one ``div.Cbox`` selector into a ``CommentItem``.

        Fields that are missing from the markup come out as empty strings.
        """
        # The element id looks like "cbox<ID>"; anything else gives no id.
        cbox_id = box.attrib.get("id", "")
        comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else ""

        date = (
            box.css("div[style*='float: right'] div::text").get("")
            .replace(_NBSP, " ").strip()
        )

        # The author is the first text node right after the "Commenter:" label.
        author = (
            box.xpath('.//strong[contains(text(),"Commenter:")]/following-sibling::text()[1]')
            .get("").strip()
        )

        # The title is taken from the LAST <strong> that is a direct child of
        # a div, and only when there is more than one such <strong>
        # (presumably the first one is the "Commenter:" label).
        strongs = box.css("div > strong::text").getall()
        title = strongs[-1].strip() if len(strongs) > 1 else ""

        # Gather text both directly in .divComment and in any descendant
        # (e.g. span-wrapped comments), dropping whitespace-only fragments.
        paragraphs = box.css(".divComment *::text, .divComment::text").getall()
        text = self._clean_text(" ".join(p.strip() for p in paragraphs if p.strip()))

        return CommentItem(
            forum_id=self.forum_id,
            reg_title=reg_title,
            reg_desc=reg_desc,
            comment_id=comment_id,
            author=author,
            date=date,
            title=title,
            text=text,
        )

    # ------------------------------------------------------------------
    def _reg_context(self, response):
        """Return ``(reg_title, reg_desc)`` parsed from the page header.

        ``reg_desc`` is the cleaned text that follows the ``<strong>`` label
        containing "Change:". ``reg_title`` is the part of it that precedes
        the first "was ", "has " or "guidance document" (case-insensitive),
        or the first 200 characters when none of those occurs.
        """
        # Page shows: <strong>Guidance Document Change:</strong> description text...
        label_node = response.xpath('//strong[contains(text(),"Change:")]')

        # Collect all sibling text nodes following the label
        siblings = label_node.xpath("following-sibling::text()").getall()
        reg_desc = self._clean_text(" ".join(t.strip() for t in siblings if t.strip()))

        m = re.match(r"^(.+?)\s+(?:was |has |guidance document)", reg_desc, re.IGNORECASE)
        reg_title = m.group(1).strip() if m else reg_desc[:200]

        return reg_title, reg_desc

    def _last_page(self, response):
        """Return the highest page number linked from the pagination form.

        Page numbers are read from ``vpage.value=<n>`` in the hrefs inside
        ``form[name="page"]``; when no such link exists the result is 1.
        """
        hrefs = response.xpath(
            '//form[@name="page"]//a[contains(@href,"vpage.value=")]/@href'
        ).getall()
        pages = [
            int(m.group(1))
            for h in hrefs
            if (m := re.search(r"vpage\.value=(\d+)", h))
        ]
        return max(pages) if pages else 1
|