updated instructions and added artifacts

This commit is contained in:
2026-05-05 13:50:27 -04:00
parent e7df0b24a1
commit 314f8d2621
4 changed files with 225 additions and 13 deletions

45
docs/townhall.py Normal file
View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import scrapy
from items import CommentItem
import textblob
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
class TownhallSpider(scrapy.Spider):
    """Collect forum comments from Virginia Town Hall and score their sentiment.

    The spider loads the comment listing in ``start_urls``, follows the link
    in the first cell of every table row, and emits one ``CommentItem`` per
    comment page with the comment text and its Naive Bayes sentiment scores.
    Items are exported as JSON lines via ``custom_settings``.
    """

    name = 'townhall'
    allowed_domains = ['townhall.virginia.gov']
    start_urls = ['https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452']
    # NOTE(review): newer Scrapy releases prefer the single FEEDS setting over
    # FEED_URI / FEED_FORMAT -- confirm against the installed version before
    # migrating; the values are kept as-is so the output file is unchanged.
    custom_settings = {
        'FEED_EXPORTERS': {
            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        },
        'FEED_URI': '%(name)s_%(time)s.json',
        'FEED_FORMAT': 'jsonlines',
    }

    # One analyzer shared by every comment: building a fresh
    # NaiveBayesAnalyzer per response would retrain its classifier each time.
    _analyzer = NaiveBayesAnalyzer()

    def parse(self, response):
        """Walk the comment listing and schedule a request per comment.

        Args:
            response: The listing page response.

        Yields:
            Requests for each comment page, handled by ``parse_comment``.
        """
        rows = response.css('#contentwide>table>tr')
        # The first row is the table header, so skip it.
        for row in rows[1:]:
            cols = row.xpath('.//td')
            if not cols:
                # Rows without cells carry no comment; an unguarded index
                # here would abort the remaining rows of the page.
                continue

            link = cols[0].css('a::attr(href)').get()
            if not link:
                # Nothing to follow for this row.
                continue

            comment_title = cols[0].xpath('a/text()').get()
            commenter = cols[1].xpath('text()').get() if len(cols) > 1 else None
            self.logger.info('%s | %s', comment_title, commenter)

            yield response.follow(link, callback=self.parse_comment)

    def parse_comment(self, response):
        """Extract one comment's text and attach sentiment scores.

        Args:
            response: The individual comment page response.

        Yields:
            A ``CommentItem`` with ``comment``, ``sentiment``,
            ``sentiment_pos`` and ``sentiment_neg`` populated. Nothing is
            yielded when the page has no matching comment text.
        """
        text = response.css('.divComment>p::text').get()
        if text is None:
            self.logger.warning('No comment text found at %s', response.url)
            return

        # Replace non-breaking spaces with ordinary spaces.
        text = text.replace('\u00a0', ' ')
        sentiment = TextBlob(text, analyzer=self._analyzer).sentiment

        entry = CommentItem()
        entry['comment'] = text
        entry['sentiment'] = sentiment.classification
        entry['sentiment_pos'] = sentiment.p_pos
        entry['sentiment_neg'] = sentiment.p_neg
        yield entry