updated instructions and added artifacts

This commit is contained in:
2026-05-05 13:50:27 -04:00
parent e7df0b24a1
commit 314f8d2621
4 changed files with 225 additions and 13 deletions

View File

@@ -5,24 +5,24 @@
- prefer minimal diffs; avoid refactors unless required for the active task
## tech stack
- python; scrapy
- python; scrapy, pytest
- file storage: json or csv
- assume local virtual env is available and accessible
- do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes
## workflow
- prefer direct argv commands (no bash -lc / compound shell chains) unless necessary
- work on ONE task at a time unless explicitly instructed otherwise
- at the start of work, state the task id you are executing
- do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so
- propose incremental steps
- always include basic tests for core logic
- when you complete a task:
- prefer direct commands
- work on ONE task at a time unless explicitly instructed otherwise:
- at the start of work, state the task id you are executing
- do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so
- propose incremental steps
- always include basic tests for core logic
- when you complete a task:
- mark it [X] in docs/tasks.md
- fill in evidence with commit hash + commands run
- never mark complete unless acceptance criteria are met
- include date and time (HH:MM)
- follow this format:
```
* [ ] t1.1 Task Title (1)
Description and PM notes

105
docs/tb.py Normal file
View File

@@ -0,0 +1,105 @@
import jsonlines
import re
from textblob import TextBlob
from collections import Counter
def tprint(obj):
    '''Print the type of ``obj`` and then ``obj`` itself, separated by " : ".'''
    kind = type(obj)
    print(f"{kind} : {obj}")
def sort_file(file, limit=25):
    '''Count positive and negative comments in a jsonlines file.

    Reads up to ``limit`` records from ``file`` and tallies those whose
    ``'sentiment'`` field equals ``'pos'`` or ``'neg'``; any other value is
    ignored.  The tally is printed and also returned.

    Args:
        file: Path of the jsonlines file to read.
        limit: Maximum number of records to inspect.  Defaults to 25, the
            sample size that used to be hard-coded; pass ``None`` to read
            the whole file.

    Returns:
        A ``(pos, neg)`` tuple with the two counts.

    Raises:
        KeyError: If an inspected record has no ``'sentiment'`` field.
    '''
    # Imported locally so this function needs no new file-level import.
    from itertools import islice

    pos = 0
    neg = 0
    with jsonlines.open(file, mode='r') as reader:
        # Confirm type
        tprint(reader)
        # islice simply stops at end of file, so a file holding fewer than
        # ``limit`` records no longer raises StopIteration.
        for record in islice(reader, limit):
            tprint(record)
            sentiment = record['sentiment']
            if sentiment == 'pos':
                pos += 1
            elif sentiment == 'neg':
                neg += 1
    print(f'{pos} positive and {neg} negative comments')
    return pos, neg
def process_file(file, author='Smythers'):
    '''Collect the first content entry of every record written by ``author``.

    Only the first element of ``record['author']`` and ``record['content']``
    is used (presumably both are the list-valued fields the scraper
    exports -- confirm against the data).  Records that lack either key, or
    whose sequence is empty, are skipped.

    Args:
        file: Path of the jsonlines file to read.
        author: Name compared against ``record['author'][0]``.  Defaults to
            ``'Smythers'``, the value that used to be hard-coded.

    Returns:
        A list holding ``record['content'][0]`` for each matching record, in
        file order.
    '''
    posts = []
    with jsonlines.open(file, mode='r') as reader:
        for record in reader:
            try:
                if record['author'][0] == author:
                    posts.append(record['content'][0])
            except (KeyError, IndexError):
                # Malformed record (missing key or empty list): skip it
                # rather than abort the whole pass.
                continue
    return posts
def write_file(file, data: object):
    '''Store each element of ``data`` in ``file`` as one jsonlines record.

    ``file`` is opened with ``mode='w'``.  After the last element has been
    written, 'write successful' is printed.
    '''
    with jsonlines.open(file, mode='w') as sink:
        for record in data:
            sink.write(record)
        print('write successful')
def clean_text(text: str):
    '''Return ``text`` after html-tag removal followed by URL removal.'''
    return remove_http(remove_html(text))
def remove_html(text: str):
    '''Return ``text`` with every ``<...>`` span deleted.

    The match is non-greedy and does not cross line breaks.
    '''
    tag_pattern = re.compile('<.*?>')
    return tag_pattern.sub('', text)
def remove_http(text: str):
    '''Return ``text`` with each ``http`` + non-whitespace run deleted.'''
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
def get_nouns(text: str):
    '''Return the ``tags`` TextBlob computes for ``text``.

    NOTE(review): despite the name, nothing here filters for nouns -- every
    tagged entry is returned.  Confirm whether a noun-only filter is wanted.
    '''
    return TextBlob(text).tags
# Locations of the scraped inputs and of the output file.
vadoe = '/vadoe/vadoe/vadoe/townhall_2021-01-14T02-05-51.json'
vadoe_p = '/vadoe/vadoe/vadoe/townhall_2021-01-14T05-11-55.json'
dlr = '/vadoe/vadoe/vadoe/dlr.json'
smythers_pc = '/vadoe/vadoe/vadoe/smythers.json'
write_to = '/vadoe/vadoe/vadoe/nouns.json'

# Pull the matching posts, then strip markup and links from each of them.
smythers_posts = process_file(dlr)
cleaned = list(map(clean_text, smythers_posts))

# Flatten the per-post tag sequences into a single list of entries.
nouns = []
for post in cleaned:
    nouns.extend(get_nouns(post))

# Count every entry and keep only those seen more than twice.
sortedNouns = Counter(nouns)
nouns = [(entry, count) for entry, count in sortedNouns.items() if count > 2]
print(nouns)
write_file(write_to, nouns)

45
docs/townhall.py Normal file
View File

@@ -0,0 +1,45 @@
# -*- coding: utf-8 -*-
import scrapy
from items import CommentItem
import textblob
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
class TownhallSpider(scrapy.Spider):
    '''Scrape the comments of one Town Hall forum and score their sentiment.

    ``parse`` walks the comment table of the start page and follows each
    comment link; ``parse_comment`` turns a comment page into a
    ``CommentItem`` holding the text and its Naive Bayes sentiment.
    '''

    name = 'townhall'
    allowed_domains = ['townhall.virginia.gov']
    start_urls = ['https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452']
    custom_settings = {
        'FEED_EXPORTERS': {
            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        },
        'FEED_URI': '%(name)s_%(time)s.json',
        'FEED_FORMAT': 'jsonlines'
    }

    # One analyzer shared by the whole crawl.  The analyzer trains lazily and
    # per instance, so creating a new one for every comment repeats the
    # training each time.
    sentiment_analyzer = NaiveBayesAnalyzer()

    def parse(self, response):
        '''Follow the link of every comment row in the comment table.

        Rows with fewer than two cells, or without a link in the first
        cell, are skipped.

        Yields:
            One request per comment link, handled by ``parse_comment``.
        '''
        rows = response.css('#contentwide>table>tr')
        # cut out the header row
        for row in rows[1:]:
            cols = row.xpath('.//td')
            if len(cols) < 2:
                # Not enough cells to read a title and a commenter from.
                continue
            linkfollow = cols[0].css('a::attr(href)').get()
            if not linkfollow:
                # response.follow() cannot build a request without a URL.
                continue
            comment_title = cols[0].xpath('a/text()').get()
            commenter = cols[1].xpath('text()').get()
            print(f'{comment_title} | {commenter}')
            yield response.follow(linkfollow, callback=self.parse_comment)

    def parse_comment(self, response):
        '''Build a ``CommentItem`` from a single comment page.

        Yields:
            One item with the comment text, the analyzer's classification
            and its positive/negative probabilities.  Nothing is yielded
            when the page holds no comment text.
        '''
        text = response.css('.divComment>p::text').get()
        if text is None:
            # .get() returns None when nothing matches; calling .replace on
            # it would raise AttributeError and lose the page silently.
            self.logger.warning('no comment text found at %s', response.url)
            return
        # Turn non-breaking spaces into plain spaces.
        text = text.replace(u'\u00a0', ' ')
        sentiment = TextBlob(text, analyzer=self.sentiment_analyzer).sentiment
        entry = CommentItem()
        entry['comment'] = text
        entry['sentiment'] = sentiment.classification
        entry['sentiment_pos'] = sentiment.p_pos
        entry['sentiment_neg'] = sentiment.p_neg
        yield entry

62
docs/townhall2.py Normal file
View File

@@ -0,0 +1,62 @@
# -*- coding: utf-8 -*-
import scrapy
from items import CommentItem
import textblob
from textblob import TextBlob
from textblob.sentiments import NaiveBayesAnalyzer
class TownhallSpider(scrapy.Spider):
    '''Scrape the comments of every Town Hall forum and score their sentiment.

    ``parse`` follows the forum links found on the start page,
    ``parse_forum`` follows each comment link of a forum, and
    ``parse_comment`` turns a comment page into a ``CommentItem`` holding
    the text and its Naive Bayes sentiment.
    '''

    name = 'townhall'
    allowed_domains = ['townhall.virginia.gov']
    start_urls = ['https://www.townhall.virginia.gov/L/Forums.cfm']
    custom_settings = {
        'FEED_EXPORTERS': {
            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        },
        'FEED_URI': '%(name)s_%(time)s.json',
        'FEED_FORMAT': 'jsonlines'
    }

    # One analyzer shared by the whole crawl.  The analyzer trains lazily and
    # per instance, so creating a new one for every comment repeats the
    # training each time.
    sentiment_analyzer = NaiveBayesAnalyzer()

    def parse(self, response):
        '''Follow every forum link found in the start page's table cells.

        Only dispatches to ``parse_forum``; reading comment rows is that
        method's job, and doing it here on single cells would index the
        cell's own nested ``td`` elements (normally none).

        Yields:
            A request, handled by ``parse_forum``, for each cell whose first
            link href contains ``'comments'``.
        '''
        for cell in response.css('table>tr>td'):
            # ::attr(href) with .get() gives None for a cell without a link,
            # where .attrib['href'] raised KeyError.
            linkfollow = cell.css('a::attr(href)').get()
            if linkfollow and 'comments' in linkfollow:
                yield response.follow(linkfollow, callback=self.parse_forum)

    def parse_forum(self, response):
        '''Follow the link of every comment row in a forum's comment table.

        Rows with fewer than two cells, or without a link in the first
        cell, are skipped.

        Yields:
            One request per comment link, handled by ``parse_comment``.
        '''
        rows = response.css('#contentwide>table>tr')
        # cut out the header row
        for row in rows[1:]:
            cols = row.xpath('.//td')
            if len(cols) < 2:
                # Not enough cells to read a title and a commenter from.
                continue
            linkfollow = cols[0].css('a::attr(href)').get()
            if not linkfollow:
                # response.follow() cannot build a request without a URL.
                continue
            comment_title = cols[0].xpath('a/text()').get()
            commenter = cols[1].xpath('text()').get()
            print(f'{comment_title} | {commenter}')
            yield response.follow(linkfollow, callback=self.parse_comment)

    def parse_comment(self, response):
        '''Build a ``CommentItem`` from a single comment page.

        Yields:
            One item with the comment text, the analyzer's classification
            and its positive/negative probabilities.  Nothing is yielded
            when the page holds no comment text.
        '''
        text = response.css('.divComment>p::text').get()
        if text is None:
            # .get() returns None when nothing matches; calling .replace on
            # it would raise AttributeError and lose the page silently.
            self.logger.warning('no comment text found at %s', response.url)
            return
        # Turn non-breaking spaces into plain spaces.
        text = text.replace(u'\u00a0', ' ')
        sentiment = TextBlob(text, analyzer=self.sentiment_analyzer).sentiment
        entry = CommentItem()
        entry['comment'] = text
        entry['sentiment'] = sentiment.classification
        entry['sentiment_pos'] = sentiment.p_pos
        entry['sentiment_neg'] = sentiment.p_neg
        yield entry