"""Scrapy spider that collects comments from a townhall.virginia.gov forum."""

import math
import re
from urllib.parse import urlencode

import scrapy

from scraper.items import CommentItem

_BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm"
_NBSP = "\xa0"
_REPLACEMENT_CHAR = "�"

# Captures the leading part of the description, up to (not including) the
# first "was ", "has " or "guidance document" (case-insensitive).
_TITLE_RE = re.compile(r"^(.+?)\s+(?:was |has |guidance document)", re.IGNORECASE)

# Extracts the page number from a pagination link's href.
_PAGE_RE = re.compile(r"vpage\.value=(\d+)")


def _view_url(forum_id):
    """Return the comment-listing URL for *forum_id*."""
    return f"{_BASE}?GdocForumID={forum_id}"


def _clean(text):
    """Return *text* with NBSPs as spaces, U+FFFD as "'", and edges stripped."""
    # NOTE(review): U+FFFD is presumably a mis-decoded apostrophe on the
    # source pages; confirm against live data.
    return text.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()


class ForumSpider(scrapy.Spider):
    """Crawl every page of one forum's comment listing and emit CommentItems.

    The first page is requested with ``is_first`` set in ``meta``; its callback
    reads the forum description and the highest page number, then schedules
    the remaining pages with the description carried along in ``meta``.
    """

    name = "forum"
    allowed_domains = ["townhall.virginia.gov"]

    # Override at runtime: scrapy crawl forum -a forum_id=452
    forum_id = "452"
    per_page = 500

    async def start(self):
        """Yield the request for page 1, flagged as the first page."""
        yield self._page_request(1, {"is_first": True})

    def _page_request(self, page, meta):
        """Build the form request for listing page *page*, carrying *meta*."""
        return scrapy.FormRequest(
            _view_url(self.forum_id),
            formdata={
                "vPage": str(page),
                # str() so the value works whether per_page is the int
                # default or a string assigned at runtime.
                "vPerPage": str(self.per_page),
                "sub1": "go",
            },
            callback=self.parse_comments,
            meta=meta,
        )

    # ------------------------------------------------------------------

    def parse_comments(self, response):
        """Yield follow-up page requests (first page only) and comment items.

        On the first page the forum description is extracted from the page
        itself; on later pages it is read back from ``response.meta``.
        """
        if response.meta.get("is_first"):
            reg_title, reg_desc = self._reg_context(response)
            last_page = self._last_page(response)
            for page in range(2, last_page + 1):
                yield self._page_request(
                    page, {"reg_title": reg_title, "reg_desc": reg_desc}
                )
        else:
            reg_title = response.meta["reg_title"]
            reg_desc = response.meta["reg_desc"]

        for box in response.css("div.Cbox"):
            yield self._parse_box(box, reg_title, reg_desc)

    # ------------------------------------------------------------------

    def _parse_box(self, box, reg_title, reg_desc):
        """Convert one ``div.Cbox`` selector into a CommentItem.

        Fields that cannot be found in the markup come back as empty strings.
        """
        # The element id is expected to look like "cbox<ID>"; anything else
        # yields an empty comment_id.
        cbox_id = box.attrib.get("id", "")
        comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else ""

        date = (
            box.css("div[style*='float: right'] div::text")
            .get("")
            .replace(_NBSP, " ")
            .strip()
        )

        # The author is the first text node following the "Commenter:" label.
        author = (
            box.xpath(
                './/strong[contains(text(),"Commenter:")]/following-sibling::text()[1]'
            )
            .get("")
            .strip()
        )

        # The title is the last <strong> text directly under a div, and is
        # only used when there is more than one such text (the first is
        # presumably the "Commenter:" label).
        strongs = box.css("div > strong::text").getall()
        title = strongs[-1].strip() if len(strongs) > 1 else ""

        paragraphs = box.css(".divComment *::text, .divComment::text").getall()
        text = _clean(" ".join(p.strip() for p in paragraphs if p.strip()))

        return CommentItem(
            forum_id=self.forum_id,
            reg_title=reg_title,
            reg_desc=reg_desc,
            comment_id=comment_id,
            author=author,
            date=date,
            title=title,
            text=text,
        )

    # ------------------------------------------------------------------

    def _reg_context(self, response):
        """Return ``(reg_title, reg_desc)`` extracted from the listing page.

        ``reg_desc`` is all text following the <strong> label that contains
        "Change:". ``reg_title`` is the part of that text before the first
        "was ", "has " or "guidance document", or the first 200 characters
        when none of those occurs.
        """
        # Page shows: Guidance Document Change: description text...
        label_node = response.xpath('//strong[contains(text(),"Change:")]')

        # Collect all sibling text nodes following the label.
        siblings = label_node.xpath("following-sibling::text()").getall()
        reg_desc = _clean(" ".join(t.strip() for t in siblings if t.strip()))

        m = _TITLE_RE.match(reg_desc)
        reg_title = m.group(1).strip() if m else reg_desc[:200]
        return reg_title, reg_desc

    def _last_page(self, response):
        """Return the highest page number in the pagination links, or 1."""
        hrefs = response.xpath(
            '//form[@name="page"]//a[contains(@href,"vpage.value=")]/@href'
        ).getall()
        pages = [int(m.group(1)) for h in hrefs if (m := _PAGE_RE.search(h))]
        return max(pages, default=1)