"""Tests for ForumSpider parsing logic using fake HTML responses.""" from scrapy.http import HtmlResponse, Request from scraper.spiders.forum import ForumSpider def fake_response(url, body, meta=None): req = Request(url=url, meta=meta or {}) return HtmlResponse(url=url, body=body.encode("utf-8"), request=req) # --------------------------------------------------------------------------- # Minimal page HTML fragments PAGE1_HTML = """ Guidance Document Change: The Model Policies for the Treatment of Transgender Students was developed in response to House Bill 145 and Senate Bill 161.
3 Next
1/4/21  9:15 am
Commenter: Alice Example

I strongly support this
 

This is a great policy for students.

All schools should follow it.

CommentID: 101
1/5/21  10:00 am
Commenter: Bob Sample

Opposed
 

I do not support this guidance.

CommentID: 102
""" PAGE2_HTML = """
1/6/21  11:00 am
Commenter: Carol T

Support
 

This policy is long overdue.

""" def make_spider(): return ForumSpider() # --------------------------------------------------------------------------- def test_page1_generates_remaining_page_requests(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) results = list(spider.parse_comments(response)) form_reqs = [r for r in results if isinstance(r, scrapy.FormRequest)] # Pages 2 and 3 should be requested (last page link = 3) assert len(form_reqs) == 2 pages = sorted(r.body.decode() for r in form_reqs) assert "vPage=2" in pages[0] assert "vPage=3" in pages[1] def test_page1_yields_items(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) results = list(spider.parse_comments(response)) from scraper.items import CommentItem items = [r for r in results if isinstance(r, CommentItem)] assert len(items) == 2 def test_comment_fields_parsed_correctly(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] item = items[0] assert item["comment_id"] == "101" assert item["author"] == "Alice Example" assert item["title"] == "I strongly support this" assert "great policy" in item["text"] assert "All schools" in item["text"] # multi-paragraph joined assert "1/4/21" in item["date"] def test_reg_context_extracted(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] item = items[0] assert "Transgender Students" in item["reg_title"] assert "House Bill 145" in item["reg_desc"] def test_subsequent_page_uses_meta_reg_context(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE2_HTML, meta={"reg_title": "Test Reg", "reg_desc": "Full description text"}, ) from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] assert len(items) == 1 assert items[0]["reg_title"] == "Test Reg" assert items[0]["author"] == "Carol T" def test_last_page_detection(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) assert spider._last_page(response) == 3 import scrapy SPAN_WRAPPED_HTML = """ Guidance Document Change: Some regulation was developed.
1
2/1/21  8:00 am
Commenter: Dan Span

Opposed
 

Text inside a span element.

""" def test_span_wrapped_text_is_extracted(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", SPAN_WRAPPED_HTML, meta={"is_first": True}, ) from scraper.items import CommentItem items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] assert len(items) == 1 assert "Text inside a span element" in items[0]["text"]