"""Tests for ForumSpider parsing logic using fake HTML responses."""
from scrapy.http import HtmlResponse, Request
from scraper.spiders.forum import ForumSpider
def fake_response(url, body, meta=None):
    """Build an ``HtmlResponse`` for *url* backed by a ``Request``.

    *body* is a ``str`` that is UTF-8 encoded before being handed to the
    response.  *meta* is attached to the underlying request; a falsy value
    (including the default ``None``) is replaced by an empty dict.
    """
    request_meta = meta if meta else {}
    backing_request = Request(url=url, meta=request_meta)
    encoded_body = body.encode("utf-8")
    return HtmlResponse(url=url, body=encoded_body, request=backing_request)
# ---------------------------------------------------------------------------
# Minimal page HTML fragments
# First-page fixture: a "Guidance Document Change" heading and description
# followed by two commenter entries (Alice Example, Bob Sample).
# NOTE(review): as written here the fixture carries no markup, comment ids,
# dates or pagination links, although tests in this module assert on those
# values -- confirm the fixture has not been stripped of its HTML tags.
PAGE1_HTML = """
Guidance Document Change: The Model Policies for the Treatment of Transgender Students
was developed in response to House Bill 145 and Senate Bill 161.
Commenter:
Alice Example
I strongly support this
Commenter:
Bob Sample
Opposed
"""
# Follow-up page fixture: a single commenter entry (Carol T) and no
# "Guidance Document Change" heading.
PAGE2_HTML = """
Commenter:
Carol T
Support
"""
def make_spider():
    """Return a new ``ForumSpider`` constructed with no arguments."""
    spider = ForumSpider()
    return spider
# ---------------------------------------------------------------------------
def test_page1_generates_remaining_page_requests():
    """Parsing the first page should emit one FormRequest per remaining page.

    The response is flagged with ``meta={"is_first": True}``; the results of
    ``parse_comments`` are expected to contain exactly two ``FormRequest``
    objects whose form bodies carry ``vPage=2`` and ``vPage=3``.
    """
    # Imported locally so this test does not rely on a module-level
    # ``import scrapy`` that appears further down the file.
    from urllib.parse import parse_qs

    import scrapy

    spider = make_spider()
    response = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    results = list(spider.parse_comments(response))
    form_reqs = [r for r in results if isinstance(r, scrapy.FormRequest)]
    # Pages 2 and 3 should be requested (last page link = 3)
    assert len(form_reqs) == 2
    # Decode the url-encoded form body and compare the vPage field exactly;
    # a substring check such as ``"vPage=2" in body`` would also accept
    # ``vPage=20`` or ``prevPage=2``.
    requested_pages = sorted(
        parse_qs(r.body.decode()).get("vPage", []) for r in form_reqs
    )
    assert requested_pages == [["2"], ["3"]]
def test_page1_yields_items():
    """Parsing the first-page fixture should yield two ``CommentItem`` objects."""
    forum_spider = make_spider()
    page_url = "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452"
    first_page = fake_response(page_url, PAGE1_HTML, meta={"is_first": True})
    parsed = list(forum_spider.parse_comments(first_page))

    from scraper.items import CommentItem

    # Keep only the scraped comments; anything else in the output is ignored.
    comments = []
    for entry in parsed:
        if isinstance(entry, CommentItem):
            comments.append(entry)
    assert len(comments) == 2
def test_comment_fields_parsed_correctly():
    """Check the individual fields of the first comment parsed from page 1."""
    forum_spider = make_spider()
    page_url = "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452"
    first_page = fake_response(page_url, PAGE1_HTML, meta={"is_first": True})

    from scraper.items import CommentItem

    comments = [
        entry
        for entry in forum_spider.parse_comments(first_page)
        if isinstance(entry, CommentItem)
    ]
    first = comments[0]

    # Fields that must match exactly.
    assert first["comment_id"] == "101"
    assert first["author"] == "Alice Example"
    assert first["title"] == "I strongly support this"

    # Fields checked by substring only.
    body_text = first["text"]
    assert "great policy" in body_text
    assert "All schools" in body_text  # multi-paragraph joined
    assert "1/4/21" in first["date"]
def test_reg_context_extracted():
    """The first comment from page 1 should carry the regulation title and description."""
    forum_spider = make_spider()
    page_url = "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452"
    first_page = fake_response(page_url, PAGE1_HTML, meta={"is_first": True})

    from scraper.items import CommentItem

    comments = [
        entry
        for entry in forum_spider.parse_comments(first_page)
        if isinstance(entry, CommentItem)
    ]
    first = comments[0]
    reg_title = first["reg_title"]
    assert "Transgender Students" in reg_title
    reg_desc = first["reg_desc"]
    assert "House Bill 145" in reg_desc
def test_subsequent_page_uses_meta_reg_context():
    """A page-2 response should take ``reg_title`` from the request meta.

    The response meta supplies ``reg_title`` and ``reg_desc`` and omits
    ``is_first``; the single parsed comment must echo the supplied title and
    report the fixture's author.
    """
    forum_spider = make_spider()
    carried_context = {"reg_title": "Test Reg", "reg_desc": "Full description text"}
    later_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE2_HTML,
        meta=carried_context,
    )

    from scraper.items import CommentItem

    comments = [
        entry
        for entry in forum_spider.parse_comments(later_page)
        if isinstance(entry, CommentItem)
    ]
    assert len(comments) == 1
    only_comment = comments[0]
    assert only_comment["reg_title"] == "Test Reg"
    assert only_comment["author"] == "Carol T"
def test_last_page_detection():
    """``_last_page`` should report 3 for the first-page fixture."""
    forum_spider = make_spider()
    page_url = "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452"
    first_page = fake_response(page_url, PAGE1_HTML, meta={"is_first": True})
    detected_last_page = forum_spider._last_page(first_page)
    assert detected_last_page == 3
import scrapy
# Fixture used by test_span_wrapped_text_is_extracted: a short "Guidance
# Document Change" heading followed by one commenter entry (Dan Span).
# NOTE(review): the phrase "Text inside a span element" asserted by that test
# does not appear in the fixture as written -- confirm the markup and comment
# body have not been stripped.
SPAN_WRAPPED_HTML = """
Guidance Document Change: Some regulation was developed.
Commenter:
Dan Span
Opposed
"""
def test_span_wrapped_text_is_extracted():
    """The span-wrapped fixture should yield one comment containing the span text."""
    forum_spider = make_spider()
    page_url = "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452"
    span_page = fake_response(page_url, SPAN_WRAPPED_HTML, meta={"is_first": True})

    from scraper.items import CommentItem

    comments = [
        entry
        for entry in forum_spider.parse_comments(span_page)
        if isinstance(entry, CommentItem)
    ]
    assert len(comments) == 1
    comment_text = comments[0]["text"]
    assert "Text inside a span element" in comment_text