Spider fetches ViewComments.cfm?GdocForumID=N with vPerPage=500, generates all page requests from page-1 metadata, and parses each div.Cbox for comment_id, author, date, title, text, reg_title, reg_desc. Handles span-wrapped comment text. Fixes UTF-8/windows-1251 meta-tag encoding mismatch. 9083 items, 15 empty-text (0.17%). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
228 lines
7.3 KiB
Python
228 lines
7.3 KiB
Python
"""Tests for ForumSpider parsing logic using fake HTML responses."""
|
|
|
|
from scrapy.http import HtmlResponse, Request
|
|
|
|
from scraper.spiders.forum import ForumSpider
|
|
|
|
|
|
def fake_response(url, body, meta=None):
    """Build an ``HtmlResponse`` for *url* whose body is the HTML text *body*.

    The response is attached to a ``Request`` for the same URL carrying
    *meta* (an empty dict when *meta* is falsy). *body* is encoded to UTF-8
    bytes before being handed to the response.
    """
    request_meta = meta or {}
    request = Request(url=url, meta=request_meta)
    encoded_body = body.encode("utf-8")
    return HtmlResponse(url=url, body=encoded_body, request=request)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Minimal page HTML fragments
|
|
|
|
# First-page fixture: a "Guidance Document Change" context line, the paging
# form (links to page 3 and a "Next" link to page 2), and two comment boxes
# (CommentID 101 and 102). Stray "|" filler lines were removed from the
# literal so they no longer appear as text nodes inside the parsed HTML.
PAGE1_HTML = """
<html><body>
<strong>Guidance Document Change:</strong> The Model Policies for the Treatment of Transgender Students
was developed in response to House Bill 145 and Senate Bill 161.

<div style="font-family: verdana;">
<form name="page" id="page" action="ViewComments.cfm?GdocForumID=452" method="post">
<input name="vPage" id="vpage" type="input" value="1">
<input name="vPerPage" id="vPerPage" type="input" value="500">
<a href="javascript:document.page.vpage.value=3;document.page.submit();">3</a>
<a href="javascript:document.page.vpage.value=2;document.page.submit();">Next</a>
<input type="submit" name="sub1" value="go">
</form>
</div>

<div id="cbox101" class="Cbox">
<div style="float: right; text-align: right;">
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/4/21 9:15 am</div>
</div>
<div>
<strong>Commenter:</strong>
Alice Example
<br><br>
<strong>I strongly support this</strong>
</div>
<div style="clear: right"> </div>
<div class="divComment">
<p>This is a great policy for students.</p>
<p>All schools should follow it.</p>
</div>
<div style="float: left; font-size: 90%;">
CommentID: <a href="ViewComments.cfm?commentid=101">101</a>
</div>
</div>

<div id="cbox102" class="Cbox">
<div style="float: right; text-align: right;">
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/5/21 10:00 am</div>
</div>
<div>
<strong>Commenter:</strong>
Bob Sample
<br><br>
<strong>Opposed</strong>
</div>
<div style="clear: right"> </div>
<div class="divComment">
<p>I do not support this guidance.</p>
</div>
<div style="float: left; font-size: 90%;">
CommentID: <a href="ViewComments.cfm?commentid=102">102</a>
</div>
</div>
</body></html>
"""
|
|
|
|
# Follow-up page fixture: a single comment box (id cbox201) with no paging
# form, no context line and no CommentID footer. Stray "|" filler lines were
# removed from the literal so they no longer appear as text nodes.
PAGE2_HTML = """
<html><body>
<div id="cbox201" class="Cbox">
<div style="float: right; text-align: right;">
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">1/6/21 11:00 am</div>
</div>
<div>
<strong>Commenter:</strong>
Carol T
<br><br>
<strong>Support</strong>
</div>
<div style="clear: right"> </div>
<div class="divComment">
<p>This policy is long overdue.</p>
</div>
</div>
</body></html>
"""
|
|
|
|
|
|
def make_spider():
    """Return a new ``ForumSpider`` built with no constructor arguments."""
    spider = ForumSpider()
    return spider
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
|
|
def test_page1_generates_remaining_page_requests():
    """Parsing the first page yields one FormRequest per remaining page."""
    forum_spider = make_spider()
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    outputs = list(forum_spider.parse_comments(first_page))

    page_requests = []
    for output in outputs:
        if isinstance(output, scrapy.FormRequest):
            page_requests.append(output)

    # Pages 2 and 3 should be requested (last page link = 3)
    assert len(page_requests) == 2

    # Sort the decoded form bodies so the check is independent of yield order.
    bodies = [request.body.decode() for request in page_requests]
    bodies.sort()
    assert "vPage=2" in bodies[0]
    assert "vPage=3" in bodies[1]
|
|
|
|
|
|
def test_page1_yields_items():
    """The first-page fixture produces exactly two ``CommentItem`` objects."""
    from scraper.items import CommentItem

    forum_spider = make_spider()
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    outputs = list(forum_spider.parse_comments(first_page))

    # Requests for further pages are also yielded; keep only the items.
    comment_items = []
    for output in outputs:
        if isinstance(output, CommentItem):
            comment_items.append(output)

    assert len(comment_items) == 2
|
|
|
|
|
|
def test_comment_fields_parsed_correctly():
    """The first item of the page-1 fixture carries the expected field values."""
    from scraper.items import CommentItem

    forum_spider = make_spider()
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    comment_items = [
        output
        for output in forum_spider.parse_comments(first_page)
        if isinstance(output, CommentItem)
    ]
    first_comment = comment_items[0]

    assert first_comment["comment_id"] == "101"
    assert first_comment["author"] == "Alice Example"
    assert first_comment["title"] == "I strongly support this"

    comment_text = first_comment["text"]
    assert "great policy" in comment_text
    assert "All schools" in comment_text  # multi-paragraph joined

    assert "1/4/21" in first_comment["date"]
|
|
|
|
|
|
def test_reg_context_extracted():
    """Items from the first page carry the regulation title and description."""
    from scraper.items import CommentItem

    forum_spider = make_spider()
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    comment_items = [
        output
        for output in forum_spider.parse_comments(first_page)
        if isinstance(output, CommentItem)
    ]
    first_comment = comment_items[0]

    assert "Transgender Students" in first_comment["reg_title"]
    assert "House Bill 145" in first_comment["reg_desc"]
|
|
|
|
|
|
def test_subsequent_page_uses_meta_reg_context():
    """A later page takes its regulation title from the request meta."""
    from scraper.items import CommentItem

    forum_spider = make_spider()
    carried_context = {"reg_title": "Test Reg", "reg_desc": "Full description text"}
    later_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE2_HTML,
        meta=carried_context,
    )
    comment_items = [
        output
        for output in forum_spider.parse_comments(later_page)
        if isinstance(output, CommentItem)
    ]

    assert len(comment_items) == 1
    only_comment = comment_items[0]
    assert only_comment["reg_title"] == "Test Reg"
    assert only_comment["author"] == "Carol T"
|
|
|
|
|
|
def test_last_page_detection():
    """``_last_page`` reports 3 for the page-1 fixture, whose paging link targets page 3."""
    forum_spider = make_spider()
    first_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        PAGE1_HTML,
        meta={"is_first": True},
    )
    detected_last_page = forum_spider._last_page(first_page)
    assert detected_last_page == 3
|
|
|
|
|
|
import scrapy
|
|
|
|
# Fixture with one comment box (id cbox301) whose divComment embeds a nested
# HTML document, with the comment text inside <p><span>...</span></p>. The
# paging form links only to page 1. Stray "|" filler lines were removed from
# the literal so they no longer appear as text nodes inside the comment body.
SPAN_WRAPPED_HTML = """
<html><body>
<strong>Guidance Document Change:</strong> Some regulation was developed.

<form name="page" id="page" action="ViewComments.cfm?GdocForumID=452" method="post">
<input name="vPage" value="1"><input name="vPerPage" value="500">
<a href="javascript:document.page.vpage.value=1;document.page.submit();">1</a>
<input type="submit" name="sub1" value="go">
</form>

<div id="cbox301" class="Cbox">
<div style="float: right; text-align: right;">
<div style="background-color: white; border: 1px solid #cccccc; padding: 4px">2/1/21 8:00 am</div>
</div>
<div>
<strong>Commenter:</strong>
Dan Span
<br><br>
<strong>Opposed</strong>
</div>
<div style="clear: right"> </div>
<div class="divComment">
<!DOCTYPE html><html><head></head><body>
<p style="margin: 0in;"><span style="font-size: 10.5pt;">Text inside a span element.</span></p>
</body></html>
</div>
</div>
</body></html>
"""
|
|
|
|
|
|
def test_span_wrapped_text_is_extracted():
    """Comment text nested inside a ``<span>`` still ends up in the item's text."""
    from scraper.items import CommentItem

    forum_spider = make_spider()
    span_page = fake_response(
        "https://www.townhall.virginia.gov/L/ViewComments.cfm?GdocForumID=452",
        SPAN_WRAPPED_HTML,
        meta={"is_first": True},
    )
    comment_items = [
        output
        for output in forum_spider.parse_comments(span_page)
        if isinstance(output, CommentItem)
    ]

    assert len(comment_items) == 1
    only_comment = comment_items[0]
    assert "Text inside a span element" in only_comment["text"]
|