diff --git a/.gitignore b/.gitignore index 993262f..5d8e9d0 100644 --- a/.gitignore +++ b/.gitignore @@ -22,5 +22,9 @@ env/ archive/ +# --- scrapy --- +.scrapy/ +output/ + # --- misc --- .DS_Store \ No newline at end of file diff --git a/docs/tasks.org b/docs/tasks.org index e6c0922..45cc4de 100644 --- a/docs/tasks.org +++ b/docs/tasks.org @@ -1,5 +1,8 @@ -* [ ] t1.1: scrape one forum (1) +* [X] t1.1: scrape one forum (1) Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step. +ViewComments (townhall.virginia.gov/L/ViewComments.cfm?CommentID=#) appears to be a raw list of all comments on the forum - could be useful later for a whole-site scrape +Append the forum ID to the ViewComments URL to list a single forum's comments (townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452) +Comments are loaded from the backend via a JS-triggered button (AJAX?) ** acceptance criteria 1. run manual scraper 1. store proposal title and description @@ -10,9 +13,9 @@ Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the firs ** notes ** evidence -- commit: -- tests: -- datetime: +- commit: (see below) +- tests: 7 passing (pytest tests/) +- datetime: 2026-05-05 12:26 * [ ] t1.2: initial analysis pipeline Write a simple pipeline for both - prefer non-concurrent/async from scraping run. Should be run manually, separate from scraper. You may use scrapy, but are not required to. 
import scrapy


class CommentItem(scrapy.Item):
    """A single public comment together with the forum it was posted on.

    All fields are plain ``scrapy.Field()`` declarations; no serializers or
    processors are attached here.
    """

    # --- forum / regulation context ---
    forum_id = scrapy.Field()   # identifier of the forum the comment belongs to
    reg_title = scrapy.Field()  # title of the regulation / guidance document
    reg_desc = scrapy.Field()   # description of the regulation / guidance document

    # --- comment metadata ---
    comment_id = scrapy.Field()
    author = scrapy.Field()
    date = scrapy.Field()
    title = scrapy.Field()

    # --- comment content ---
    text = scrapy.Field()
import math
import re
from urllib.parse import urlencode

import scrapy

from scraper.items import CommentItem

_BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm"
_NBSP = "\xa0"
# U+FFFD, spelled as an escape so the constant survives any re-encoding of this file.
_REPLACEMENT_CHAR = "\ufffd"

# Compiled once at import time; both are applied to every first-page response.
_TITLE_RE = re.compile(r"^(.+?)\s+(?:was |has |guidance document)", re.IGNORECASE)
_PAGE_RE = re.compile(r"vpage\.value=(\d+)")


def _view_url(forum_id):
    """Return the ViewComments URL for *forum_id*.

    The id can come from the command line, so it is URL-encoded rather than
    interpolated verbatim into the query string.
    """
    return f"{_BASE}?{urlencode({'GdocForumID': forum_id})}"


def _clean(text):
    """Replace NBSPs with spaces and U+FFFD with an apostrophe, then strip."""
    return text.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip()


class ForumSpider(scrapy.Spider):
    """Scrape every comment of one forum from its paginated ViewComments listing.

    The first page is requested by ``start()``. Its response supplies the
    regulation title/description and the highest page number, and one request
    per remaining page is then scheduled. Every ``div.Cbox`` on every page
    becomes one ``CommentItem``.
    """

    name = "forum"
    allowed_domains = ["townhall.virginia.gov"]

    # Override at runtime: scrapy crawl forum -a forum_id=452
    forum_id = "452"
    # Sent as the form's vPerPage value with every page request.
    per_page = 500

    async def start(self):
        """Yield the request for page 1, flagged so the callback knows it is first.

        Uses Scrapy's asynchronous ``start()`` entry point (Scrapy 2.13+).
        """
        yield self._page_request(1, {"is_first": True})

    def _page_request(self, page, meta):
        """Build the paging-form request for *page*, carrying *meta* to the callback."""
        return scrapy.FormRequest(
            _view_url(self.forum_id),
            formdata={"vPage": str(page), "vPerPage": str(self.per_page), "sub1": "go"},
            callback=self.parse_comments,
            meta=meta,
        )

    # ------------------------------------------------------------------
    def parse_comments(self, response):
        """Yield follow-up page requests (first page only) and one item per comment.

        On the first page the regulation context is parsed from the response
        and attached to the meta of every follow-up request. On later pages it
        is read back from ``response.meta``, which raises ``KeyError`` if the
        request was built without it.
        """
        if response.meta.get("is_first"):
            reg_title, reg_desc = self._reg_context(response)
            for page in range(2, self._last_page(response) + 1):
                yield self._page_request(
                    page, {"reg_title": reg_title, "reg_desc": reg_desc}
                )
        else:
            reg_title = response.meta["reg_title"]
            reg_desc = response.meta["reg_desc"]

        for box in response.css("div.Cbox"):
            yield self._parse_box(box, reg_title, reg_desc)

    # ------------------------------------------------------------------
    def _parse_box(self, box, reg_title, reg_desc):
        """Turn one ``div.Cbox`` selector into a ``CommentItem``.

        Fields that cannot be located come back as empty strings rather than
        raising.
        """
        # The box id looks like "cbox<comment id>"; anything else yields "".
        cbox_id = box.attrib.get("id", "")
        comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else ""

        date = (
            box.css("div[style*='float: right'] div::text").get("")
            .replace(_NBSP, " ")
            .strip()
        )

        # The author name is the first text node after the "Commenter:" label.
        author = (
            box.xpath('.//strong[contains(text(),"Commenter:")]/following-sibling::text()[1]')
            .get("")
            .strip()
        )

        # The comment title is taken from the last <strong> that is a direct
        # child of a <div>; a lone match is the "Commenter:" label, so no title.
        # NOTE(review): "last" also matches bold text placed directly under a
        # div inside the comment body - confirm against live markup whether
        # index 1 (the second <strong>) is the safer choice.
        strongs = box.css("div > strong::text").getall()
        title = strongs[-1].strip() if len(strongs) > 1 else ""

        # Join every non-blank text node under .divComment with single spaces.
        fragments = box.css(".divComment *::text, .divComment::text").getall()
        stripped = (fragment.strip() for fragment in fragments)
        text = _clean(" ".join(piece for piece in stripped if piece))

        return CommentItem(
            forum_id=self.forum_id,
            reg_title=reg_title,
            reg_desc=reg_desc,
            comment_id=comment_id,
            author=author,
            date=date,
            title=title,
            text=text,
        )

    # ------------------------------------------------------------------
    def _reg_context(self, response):
        """Return ``(reg_title, reg_desc)`` parsed from the text after the "Change:" label.

        ``reg_desc`` is all sibling text following the label, with every run of
        whitespace collapsed to one space. ``reg_title`` is the part of that
        text before the first " was ", " has " or " guidance document"
        (case-insensitive), or the first 200 characters when none occurs.
        """
        label_node = response.xpath('//strong[contains(text(),"Change:")]')
        siblings = label_node.xpath("following-sibling::text()").getall()

        # str.split() with no argument splits on any whitespace run (newlines,
        # indentation and NBSP included), so the description comes out as one
        # clean line and a line break inside the title cannot defeat the
        # title pattern, whose "." does not match a newline.
        collapsed = " ".join(" ".join(siblings).split())
        reg_desc = collapsed.replace(_REPLACEMENT_CHAR, "'")

        match = _TITLE_RE.match(reg_desc)
        reg_title = match.group(1).strip() if match else reg_desc[:200]

        return reg_title, reg_desc

    def _last_page(self, response):
        """Return the highest page number linked from the paging form, or 1 if none."""
        hrefs = response.xpath(
            '//form[@name="page"]//a[contains(@href,"vpage.value=")]/@href'
        ).getall()
        pages = [
            int(m.group(1))
            for href in hrefs
            if (m := _PAGE_RE.search(href))
        ]
        return max(pages, default=1)
+
+ + + 3 + Next + +
+
+ +
+
+
1/4/21  9:15 am
+
+
+ Commenter: + Alice Example +

+ I strongly support this +
+
 
+
+

This is a great policy for students.

+

All schools should follow it.

+
+
+ CommentID: 101 +
+
+ +
+
+
1/5/21  10:00 am
+
+
+ Commenter: + Bob Sample +

+ Opposed +
+
 
+
+

I do not support this guidance.

+
+
+ CommentID: 102 +
+
+ +""" + +PAGE2_HTML = """ + +
+
+
1/6/21  11:00 am
+
+
+ Commenter: + Carol T +

+ Support +
+
 
+
+

This policy is long overdue.

+
+
+ +""" + + +def make_spider(): + return ForumSpider() + + +# --------------------------------------------------------------------------- + +def test_page1_generates_remaining_page_requests(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE1_HTML, + meta={"is_first": True}, + ) + results = list(spider.parse_comments(response)) + form_reqs = [r for r in results if isinstance(r, scrapy.FormRequest)] + # Pages 2 and 3 should be requested (last page link = 3) + assert len(form_reqs) == 2 + pages = sorted(r.body.decode() for r in form_reqs) + assert "vPage=2" in pages[0] + assert "vPage=3" in pages[1] + + +def test_page1_yields_items(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE1_HTML, + meta={"is_first": True}, + ) + results = list(spider.parse_comments(response)) + from scraper.items import CommentItem + items = [r for r in results if isinstance(r, CommentItem)] + assert len(items) == 2 + + +def test_comment_fields_parsed_correctly(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE1_HTML, + meta={"is_first": True}, + ) + from scraper.items import CommentItem + items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] + item = items[0] + assert item["comment_id"] == "101" + assert item["author"] == "Alice Example" + assert item["title"] == "I strongly support this" + assert "great policy" in item["text"] + assert "All schools" in item["text"] # multi-paragraph joined + assert "1/4/21" in item["date"] + + +def test_reg_context_extracted(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE1_HTML, + meta={"is_first": True}, + ) + from scraper.items import CommentItem + items = [r for r in 
spider.parse_comments(response) if isinstance(r, CommentItem)] + item = items[0] + assert "Transgender Students" in item["reg_title"] + assert "House Bill 145" in item["reg_desc"] + + +def test_subsequent_page_uses_meta_reg_context(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE2_HTML, + meta={"reg_title": "Test Reg", "reg_desc": "Full description text"}, + ) + from scraper.items import CommentItem + items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] + assert len(items) == 1 + assert items[0]["reg_title"] == "Test Reg" + assert items[0]["author"] == "Carol T" + + +def test_last_page_detection(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE1_HTML, + meta={"is_first": True}, + ) + assert spider._last_page(response) == 3 + + +import scrapy + +SPAN_WRAPPED_HTML = """ + + Guidance Document Change: Some regulation was developed. + +
+ + 1 + +
+ +
+
+
2/1/21  8:00 am
+
+
+ Commenter: + Dan Span +

+ Opposed +
+
 
+
+ +

Text inside a span element.

+ +
+
+ +""" + + +def test_span_wrapped_text_is_extracted(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + SPAN_WRAPPED_HTML, + meta={"is_first": True}, + ) + from scraper.items import CommentItem + items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] + assert len(items) == 1 + assert "Text inside a span element" in items[0]["text"]