diff --git a/docs/tasks.org b/docs/tasks.org index 593c5c5..9e64b8b 100644 --- a/docs/tasks.org +++ b/docs/tasks.org @@ -2,20 +2,36 @@ Use https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452 as the first forum. Scraper should be run manually at this step. ViewComments (townhall.virginia.gov/L/ViewComments.cfm?CommentID=#) appears to be raw list of all comments on forum - could be useful later for whole-scrape Append forum id to viewall per forum (townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452) -Comments are hydrated in backend via js-cued button (AJAX?) +Comments are hydrated in backend via js-cued button (AJAX?). ** acceptance criteria 1. run manual scraper 1. store proposal title and description 2. store comment title, commenter, date 3. store relevant metadata 2. friendly/polite scraping - +3. store forum as distinct item with title, desc +4. add forum ID in comment filename, eg forum452_comments_.jsonl +5. remove reg_title and reg_desc from each comment; these belong in forum item +6. parse datetimes into object for later use (plotting) + ** notes +- scraper/spiders/forum.py — ForumSpider using ViewComments.cfm?GdocForumID=N with POST pagination. First request fetches page 1 (vPerPage=500), discovers the last page number from the form's link, generates all remaining page requests upfront. Parses each div.Cbox for all required fields. +- scraper/items.py — CommentItem with forum_id, reg_title, reg_desc, comment_id, author, date, title, text +- tests/test_forum_spider.py — 7 tests, all passing +- Settings: DEFAULT_RESPONSE_ENCODING=utf-8 (fixes Windows-1251 meta-tag mismatch), HTTPCACHE_ENABLED=True, feed output to output/ +- ViewComments.cfm instead of comments.cfm: POST to Comments.cfm returned a 500 error (wrong endpoint). ViewComments.cfm?GdocForumID=N is the correct listing URL, returns full comment text on the page itself — no per-comment follow requests needed. 
+- Span-wrapped text: .divComment p::text missed 3.6% of comments where text is in <p><span>text</span></p>. Fixed to .divComment *::text, .divComment::text. Worth knowing for when the spider is extended to other forums. +- start() vs start_requests(): Scrapy 2.13+ deprecates start_requests() in favor of async def start() +- ForumItem vs CommentItem: ForumItem (forum_id, reg_title, reg_desc) yielded once on first page; CommentItem no longer carries reg_title/reg_desc. Both land in the same JSONL feed. +- Dynamic output filename: set via from_crawler() overriding FEEDS at 'spider' priority — format is output/forum{id}_comments_%(time)s.jsonl. FEEDS removed from settings.py; spider owns it. +- Date parsing: _parse_date() normalizes whitespace, upper-cases, parses "%m/%d/%y %I:%M %p" → ISO 8601; falls back to raw string on failure. ** evidence -- commit: beb5cf4 -- tests: 7 passing (pytest tests/) -- datetime: 2026-05-05 12:26 +- commit: beb5cf4 (AC1-2), (AC3-6) +- tests: 8 passing (`python -m pytest tests -q`) or (`python -m pytest tests/`) + - `scrapy crawl forum -a forum_id=452 -s LOG_LEVEL=WARNING 2>&1` + - retrieved 9083 comments +- datetime: 2026-05-05 * [ ] t1.2: initial analysis pipeline Write a simple pipeline for both - prefer non-concurrent/async from scraping run. Should be run manually, separate from scraper. You may use scrapy, but are not required to. @@ -29,3 +45,10 @@ Write a simple pipeline for both - prefer non-concurrent/async from scraping run - commit: - tests: - date: + +* [ ] X: complete proposal information +Ensure we capture as much useful information as possible about the actual proposal - contact information, etc. what the state actually says about what was posted. +** acceptance criteria +1. Item: `Forum` stores id, url, proposal title, description, open/close date, number of comments, agency, board, guidance document id + - add details for guidanceDoc, publication date, comments, guidance docs - eg: https://www.townhall.virginia.gov/L/GDocForum.cfm?GDocForumID=452 +2. 
Item: `Comment` stores forum_id, comment_id, author, title, text, date, url diff --git a/scraper/items.py b/scraper/items.py index 13f777f..a5e6971 100644 --- a/scraper/items.py +++ b/scraper/items.py @@ -1,17 +1,16 @@ import scrapy -class CommentItem(scrapy.Item): - # Forum / regulation context - forum_id = scrapy.Field() - reg_title = scrapy.Field() - reg_desc = scrapy.Field() +class ForumItem(scrapy.Item): + forum_id = scrapy.Field() + reg_title = scrapy.Field() + reg_desc = scrapy.Field() - # Comment metadata + +class CommentItem(scrapy.Item): + forum_id = scrapy.Field() comment_id = scrapy.Field() author = scrapy.Field() date = scrapy.Field() title = scrapy.Field() - - # Comment content text = scrapy.Field() diff --git a/scraper/settings.py b/scraper/settings.py index c86a95e..15f0c1e 100644 --- a/scraper/settings.py +++ b/scraper/settings.py @@ -80,14 +80,7 @@ HTTPCACHE_ENABLED = True HTTPCACHE_EXPIRATION_SECS = 86400 # 24 h HTTPCACHE_DIR = "httpcache" -# Output -FEEDS = { - "output/%(name)s_%(time)s.jsonl": { - "format": "jsonlines", - "encoding": "utf-8", - "overwrite": False, - } -} +# Output filename is set dynamically by each spider via from_crawler (includes forum_id). # The site declares windows-1251 in a meta tag but sends valid UTF-8 bytes. # Force UTF-8 to prevent lxml from re-decoding via the meta charset. 
diff --git a/scraper/spiders/forum.py b/scraper/spiders/forum.py index 95f8864..910ec89 100644 --- a/scraper/spiders/forum.py +++ b/scraper/spiders/forum.py @@ -1,10 +1,9 @@ -import math import re -from urllib.parse import urlencode +from datetime import datetime import scrapy -from scraper.items import CommentItem +from scraper.items import CommentItem, ForumItem _BASE = "https://www.townhall.virginia.gov/L/ViewComments.cfm" _NBSP = "\xa0" @@ -15,6 +14,14 @@ def _view_url(forum_id): return f"{_BASE}?GdocForumID={forum_id}" +def _parse_date(raw): + normalized = " ".join(raw.split()).upper() + try: + return datetime.strptime(normalized, "%m/%d/%y %I:%M %p").isoformat() + except ValueError: + return raw + + class ForumSpider(scrapy.Spider): name = "forum" allowed_domains = ["townhall.virginia.gov"] @@ -23,6 +30,22 @@ class ForumSpider(scrapy.Spider): forum_id = "452" per_page = 500 + @classmethod + def from_crawler(cls, crawler, *args, **kwargs): + spider = super().from_crawler(crawler, *args, **kwargs) + crawler.settings.set( + "FEEDS", + { + f"output/forum{spider.forum_id}_comments_%(time)s.jsonl": { + "format": "jsonlines", + "encoding": "utf-8", + "overwrite": False, + } + }, + priority="spider", + ) + return spider + async def start(self): yield scrapy.FormRequest( _view_url(self.forum_id), @@ -36,26 +59,27 @@ class ForumSpider(scrapy.Spider): if response.meta.get("is_first"): reg_title, reg_desc = self._reg_context(response) last_page = self._last_page(response) + yield ForumItem( + forum_id=self.forum_id, + reg_title=reg_title, + reg_desc=reg_desc, + ) for page in range(2, last_page + 1): yield scrapy.FormRequest( _view_url(self.forum_id), formdata={"vPage": str(page), "vPerPage": str(self.per_page), "sub1": "go"}, callback=self.parse_comments, - meta={"reg_title": reg_title, "reg_desc": reg_desc}, ) - else: - reg_title = response.meta["reg_title"] - reg_desc = response.meta["reg_desc"] for box in response.css("div.Cbox"): - yield self._parse_box(box, 
reg_title, reg_desc) + yield self._parse_box(box) # ------------------------------------------------------------------ - def _parse_box(self, box, reg_title, reg_desc): + def _parse_box(self, box): cbox_id = box.attrib.get("id", "") comment_id = cbox_id[len("cbox"):] if cbox_id.startswith("cbox") else "" - date = ( + date_raw = ( box.css("div[style*='float: right'] div::text").get("") .replace(_NBSP, " ").strip() ) @@ -75,11 +99,9 @@ class ForumSpider(scrapy.Spider): return CommentItem( forum_id=self.forum_id, - reg_title=reg_title, - reg_desc=reg_desc, comment_id=comment_id, author=author, - date=date, + date=_parse_date(date_raw), title=title, text=text, ) @@ -88,14 +110,12 @@ class ForumSpider(scrapy.Spider): def _reg_context(self, response): # Page shows: Guidance Document Change: description text... label_node = response.xpath('//strong[contains(text(),"Change:")]') - label_text = label_node.css("::text").get("").strip() # Collect all sibling text nodes following the label siblings = label_node.xpath("following-sibling::text()").getall() raw = " ".join(t.strip() for t in siblings if t.strip()) raw = raw.replace(_NBSP, " ").replace(_REPLACEMENT_CHAR, "'").strip() - # reg_desc is the full description text reg_desc = raw # reg_title: text up to the first "was " clause or first 200 chars diff --git a/tests/test_forum_spider.py b/tests/test_forum_spider.py index bcfe594..89ef38f 100644 --- a/tests/test_forum_spider.py +++ b/tests/test_forum_spider.py @@ -1,8 +1,10 @@ """Tests for ForumSpider parsing logic using fake HTML responses.""" +import scrapy from scrapy.http import HtmlResponse, Request -from scraper.spiders.forum import ForumSpider +from scraper.items import CommentItem, ForumItem +from scraper.spiders.forum import ForumSpider, _parse_date def fake_response(url, body, meta=None): @@ -120,11 +122,26 @@ def test_page1_yields_items(): meta={"is_first": True}, ) results = list(spider.parse_comments(response)) - from scraper.items import CommentItem items = [r 
for r in results if isinstance(r, CommentItem)] assert len(items) == 2 +def test_page1_yields_forum_item(): + spider = make_spider() + response = fake_response( + "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", + PAGE1_HTML, + meta={"is_first": True}, + ) + results = list(spider.parse_comments(response)) + forum_items = [r for r in results if isinstance(r, ForumItem)] + assert len(forum_items) == 1 + fi = forum_items[0] + assert "Transgender Students" in fi["reg_title"] + assert "House Bill 145" in fi["reg_desc"] + assert fi["forum_id"] == "452" + + def test_comment_fields_parsed_correctly(): spider = make_spider() response = fake_response( @@ -132,7 +149,6 @@ def test_comment_fields_parsed_correctly(): PAGE1_HTML, meta={"is_first": True}, ) - from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] item = items[0] assert item["comment_id"] == "101" @@ -140,34 +156,18 @@ def test_comment_fields_parsed_correctly(): assert item["title"] == "I strongly support this" assert "great policy" in item["text"] assert "All schools" in item["text"] # multi-paragraph joined - assert "1/4/21" in item["date"] + assert "reg_title" not in item + assert "reg_desc" not in item -def test_reg_context_extracted(): - spider = make_spider() - response = fake_response( - "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", - PAGE1_HTML, - meta={"is_first": True}, - ) - from scraper.items import CommentItem - items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] - item = items[0] - assert "Transgender Students" in item["reg_title"] - assert "House Bill 145" in item["reg_desc"] - - -def test_subsequent_page_uses_meta_reg_context(): +def test_subsequent_page_yields_comments(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE2_HTML, - meta={"reg_title": "Test Reg", "reg_desc": "Full 
description text"}, ) - from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] assert len(items) == 1 - assert items[0]["reg_title"] == "Test Reg" assert items[0]["author"] == "Carol T" @@ -181,7 +181,11 @@ def test_last_page_detection(): assert spider._last_page(response) == 3 -import scrapy +def test_date_parsed_to_iso(): + assert _parse_date("1/4/21 9:15 am") == "2021-01-04T09:15:00" + assert _parse_date("1/5/21 10:00 am") == "2021-01-05T10:00:00" + assert _parse_date("unparseable") == "unparseable" + SPAN_WRAPPED_HTML = """ @@ -221,7 +225,6 @@ def test_span_wrapped_text_is_extracted(): SPAN_WRAPPED_HTML, meta={"is_first": True}, ) - from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] assert len(items) == 1 assert "Text inside a span element" in items[0]["text"]