"""Tests for ForumSpider parsing logic using fake HTML responses.""" import scrapy from scrapy.http import HtmlResponse, Request from scraper.items import CommentItem, ForumItem from scraper.spiders.forum import ForumSpider, _parse_date def fake_response(url, body, meta=None): req = Request(url=url, meta=meta or {}) return HtmlResponse(url=url, body=body.encode("utf-8"), request=req) # --------------------------------------------------------------------------- # Minimal page HTML fragments PAGE1_HTML = """ Guidance Document Change: The Model Policies for the Treatment of Transgender Students was developed in response to House Bill 145 and Senate Bill 161.
3 Next
1/4/21  9:15 am
Commenter: Alice Example

I strongly support this
 

This is a great policy for students.

All schools should follow it.

CommentID: 101
1/5/21  10:00 am
Commenter: Bob Sample

Opposed
 

I do not support this guidance.

CommentID: 102
""" PAGE2_HTML = """
1/6/21  11:00 am
Commenter: Carol T

Support
 

This policy is long overdue.

""" def make_spider(): return ForumSpider() # --------------------------------------------------------------------------- def test_page1_generates_remaining_page_requests(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) results = list(spider.parse_comments(response)) form_reqs = [r for r in results if isinstance(r, scrapy.FormRequest)] # Pages 2 and 3 should be requested (last page link = 3) assert len(form_reqs) == 2 pages = sorted(r.body.decode() for r in form_reqs) assert "vPage=2" in pages[0] assert "vPage=3" in pages[1] def test_page1_yields_items(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) results = list(spider.parse_comments(response)) items = [r for r in results if isinstance(r, CommentItem)] assert len(items) == 2 def test_page1_yields_forum_item(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) results = list(spider.parse_comments(response)) forum_items = [r for r in results if isinstance(r, ForumItem)] assert len(forum_items) == 1 fi = forum_items[0] assert "Transgender Students" in fi["reg_title"] assert "House Bill 145" in fi["reg_desc"] assert fi["forum_id"] == "452" def test_comment_fields_parsed_correctly(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] item = items[0] assert item["comment_id"] == "101" assert item["author"] == "Alice Example" assert item["title"] == "I strongly support this" assert "great policy" in item["text"] assert "All schools" in item["text"] # multi-paragraph joined assert "reg_title" not in item assert "reg_desc" not in item def test_subsequent_page_yields_comments(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE2_HTML, ) items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] assert len(items) == 1 assert items[0]["author"] == "Carol T" def test_last_page_detection(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", PAGE1_HTML, meta={"is_first": True}, ) assert spider._last_page(response) == 3 def test_date_parsed_to_iso(): assert _parse_date("1/4/21 9:15 am") == "2021-01-04T09:15:00" assert _parse_date("1/5/21 10:00 am") == "2021-01-05T10:00:00" assert _parse_date("unparseable") == "unparseable" SPAN_WRAPPED_HTML = """ Guidance Document Change: Some regulation was developed.
1
2/1/21  8:00 am
Commenter: Dan Span

Opposed
 

Text inside a span element.

""" def test_span_wrapped_text_is_extracted(): spider = make_spider() response = fake_response( "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452", SPAN_WRAPPED_HTML, meta={"is_first": True}, ) items = [r for r in spider.parse_comments(response) if isinstance(r, CommentItem)] assert len(items) == 1 assert "Text inside a span element" in items[0]["text"]