updated instructions and added artifacts
This commit is contained in:
@@ -5,14 +5,14 @@
|
||||
- prefer minimal diffs; avoid refactors unless required for the active task
|
||||
|
||||
## tech stack
|
||||
- python; scrapy
|
||||
- python; scrapy, pytest
|
||||
- file storage: json or csv
|
||||
- assume local virtual env is available and accessible
|
||||
- do not add new dependencies unless explicitly approved; if unavoidable, document justification in the active task notes
|
||||
|
||||
## workflow
|
||||
- prefer direct argv commands (no bash -lc / compound shell chains) unless necessary
|
||||
- work on ONE task at a time unless explicitly instructed otherwise
|
||||
- prefer direct commands
|
||||
- work on ONE task at a time unless explicitly instructed otherwise:
|
||||
- at the start of work, state the task id you are executing
|
||||
- do not start work unless a task id is specified; if missing, choose the earliest unchecked task and say so
|
||||
- propose incremental steps
|
||||
@@ -22,7 +22,7 @@
|
||||
- fill in evidence with commit hash + commands run
|
||||
- never mark complete unless acceptance criteria are met
|
||||
- include date and time (HH:MM)
|
||||
|
||||
- follow this format:
|
||||
```
|
||||
* [ ] t1.1 Task Title (1)
|
||||
Description and PM notes
|
||||
|
||||
105
docs/tb.py
Normal file
105
docs/tb.py
Normal file
@@ -0,0 +1,105 @@
|
||||
import jsonlines
|
||||
import re
|
||||
from textblob import TextBlob
|
||||
from collections import Counter
|
||||
|
||||
def tprint(obj):
    '''Debug helper: print an object's type followed by the object itself.'''
    kind = type(obj)
    print(f"{kind} : {obj}")
|
||||
|
||||
|
||||
def sort_file(file, limit=25):
    '''Count positive and negative comments among the first records of a file.

    Reads up to ``limit`` records from the JSON Lines file ``file`` and
    tallies those whose ``'sentiment'`` field is ``'pos'`` or ``'neg'``.
    Records with any other (or no) sentiment value are counted as neither.

    Each record, and the reader itself, is echoed through ``tprint`` for
    debugging, and a one-line summary is printed at the end.

    Args:
        file: path of the JSON Lines file to read.
        limit: maximum number of records to inspect (default 25).

    Returns:
        A ``(pos, neg)`` tuple with the two counts.
    '''
    pos = 0
    neg = 0

    with jsonlines.open(file, mode='r') as reader:
        # Confirm type
        tprint(reader)

        # zip() against range() stops at whichever runs out first, so a file
        # with fewer than `limit` records ends the loop instead of raising
        # StopIteration, and no record beyond the limit is consumed.
        for _, record in zip(range(limit), reader):
            tprint(record)
            sentiment = record.get('sentiment')
            if sentiment == 'pos':
                pos += 1
            elif sentiment == 'neg':
                neg += 1

    print(f'{pos} positive and {neg} negative comments')
    return pos, neg
|
||||
# tst = TextBlob(obj['comment'])
|
||||
# tst.sentiment
|
||||
|
||||
def process_file(file):
    '''Find Smythers posts.

    Scans every record of the JSON Lines file ``file`` and collects the
    first element of ``'content'`` for each record whose first ``'author'``
    element equals ``'Smythers'``.

    Args:
        file: path of the JSON Lines file to read.

    Returns:
        A list with the selected content values, in file order.
    '''
    posts = []
    with jsonlines.open(file, mode='r') as reader:
        for item in reader:
            try:
                if item['author'][0] == 'Smythers':
                    posts.append(item['content'][0])
            except (KeyError, IndexError):
                # Record lacks the field, or the field is an empty list:
                # skip it rather than abort the whole scan.
                continue
    return posts
|
||||
|
||||
def write_file(file, data: object):
    '''Write every element of ``data`` to ``file`` as one JSON Lines record each.

    The file is opened in write mode, so existing content is replaced.
    Prints a confirmation message once writing has finished.
    '''
    with jsonlines.open(file, mode='w') as sink:
        sink.write_all(data)
    print('write successful')
|
||||
|
||||
def clean_text(text: str):
    '''Strip html tags first, then URLs, from ``text`` and return the result.'''
    return remove_http(remove_html(text))
|
||||
|
||||
def remove_html(text: str):
    '''Remove html tags from string.

    Everything from a ``<`` up to the next ``>`` is deleted. The pattern
    uses a negated character class rather than ``.`` so that a tag whose
    attributes are spread over several lines is removed as well.

    Args:
        text: the string to clean.

    Returns:
        ``text`` with all tags removed; text between tags is kept.
    '''
    return re.sub(r'<[^>]*>', '', text)
|
||||
|
||||
def remove_http(text: str):
    '''Remove URLs from string.

    A URL here is ``http`` followed by any run of non-whitespace
    characters; surrounding whitespace is left untouched.
    '''
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)
|
||||
|
||||
def get_nouns(text: str):
    '''Return the ``tags`` that TextBlob produces for ``text``.

    NOTE(review): despite the name, nothing is filtered here - whatever
    ``TextBlob(text).tags`` yields is returned as is. Whether to keep only
    nouns is still an open question.
    '''
    return TextBlob(text).tags
|
||||
|
||||
# Input files.
vadoe = '/vadoe/vadoe/vadoe/townhall_2021-01-14T02-05-51.json'
vadoe_p = '/vadoe/vadoe/vadoe/townhall_2021-01-14T05-11-55.json'
dlr = '/vadoe/vadoe/vadoe/dlr.json'

smythers_pc = '/vadoe/vadoe/vadoe/smythers.json'
# Output file for the tag counts computed below.
write_to = '/vadoe/vadoe/vadoe/nouns.json'

# Pull the Smythers posts out of the dlr file and strip markup and URLs.
smythers_posts = process_file(dlr)
cleaned = list(map(clean_text, smythers_posts))

# Count every tag returned by get_nouns across all cleaned posts.
sortedNouns = Counter(tag for post in cleaned for tag in get_nouns(post))

# Keep only the entries seen more than twice, as (tag, count) pairs.
nouns = [(tag, count) for tag, count in sortedNouns.items() if count > 2]

print(nouns)
write_file(write_to, nouns)
|
||||
45
docs/townhall.py
Normal file
45
docs/townhall.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import scrapy
|
||||
from items import CommentItem
|
||||
import textblob
|
||||
from textblob import TextBlob
|
||||
from textblob.sentiments import NaiveBayesAnalyzer
|
||||
|
||||
class TownhallSpider(scrapy.Spider):
    '''Crawl one comment listing page and emit a sentiment-labelled item per comment.

    ``parse`` walks the rows of the listing table and follows each row's
    link; ``parse_comment`` reads the comment text from the followed page,
    classifies it with TextBlob's NaiveBayesAnalyzer and yields a
    ``CommentItem``.
    '''

    name = 'townhall'
    allowed_domains = ['townhall.virginia.gov']
    start_urls = ['https://www.townhall.virginia.gov/L/comments.cfm?GDocForumID=452']
    custom_settings = {
        'FEED_EXPORTERS': {
            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        },
        'FEED_URI': '%(name)s_%(time)s.json',
        'FEED_FORMAT': 'jsonlines',
    }

    # One analyzer shared by every comment. Creating a new
    # NaiveBayesAnalyzer per response makes each instance train its own
    # classifier again, which dominates the crawl time.
    _analyzer = NaiveBayesAnalyzer()

    def parse(self, response):
        '''Follow the link in the first cell of every data row of the listing table.

        Rows with fewer than two cells, or without a link, are skipped so
        that one malformed row cannot abort the rest of the page.
        '''
        rows = response.css('#contentwide>table>tr')
        # cut out the header row
        for each in rows[1:]:
            cols = each.xpath('.//td')
            if len(cols) < 2:
                continue
            linkfollow = cols[0].css('a::attr(href)').get()
            # TODO: title and commenter are raw node text and still need clean up
            comment_title = cols[0].xpath('a/text()').get()
            commenter = cols[1].xpath('text()').get()
            print(f'{comment_title} | {commenter}')
            if linkfollow:
                yield response.follow(linkfollow, callback=self.parse_comment)

    def parse_comment(self, response):
        '''Yield a ``CommentItem`` holding the comment text and its sentiment.

        The item carries ``comment``, ``sentiment`` (the classification
        label), ``sentiment_pos`` and ``sentiment_neg``. Pages on which the
        comment selector matches nothing are logged and skipped.
        '''
        text = response.css('.divComment>p::text').get()
        if text is None:
            self.logger.warning('No comment text found at %s', response.url)
            return
        # Replace non-breaking spaces with ordinary ones.
        text = text.replace(u'\u00a0', ' ')

        sentiment = TextBlob(text, analyzer=self._analyzer).sentiment

        entry = CommentItem()
        entry['comment'] = text
        entry['sentiment'] = sentiment.classification
        entry['sentiment_pos'] = sentiment.p_pos
        entry['sentiment_neg'] = sentiment.p_neg
        yield entry
|
||||
62
docs/townhall2.py
Normal file
62
docs/townhall2.py
Normal file
@@ -0,0 +1,62 @@
|
||||
# -*- coding: utf-8 -*-
|
||||
import scrapy
|
||||
from items import CommentItem
|
||||
import textblob
|
||||
from textblob import TextBlob
|
||||
from textblob.sentiments import NaiveBayesAnalyzer
|
||||
|
||||
class TownhallSpider(scrapy.Spider):
    '''Crawl the forum index, then every forum's comment listing, then each comment.

    ``parse`` follows every link on the index page whose href contains
    ``'comments'``; ``parse_forum`` walks the rows of a listing table and
    follows each row's link; ``parse_comment`` classifies the comment text
    with TextBlob's NaiveBayesAnalyzer and yields a ``CommentItem``.
    '''

    name = 'townhall'
    allowed_domains = ['townhall.virginia.gov']
    start_urls = ['https://www.townhall.virginia.gov/L/Forums.cfm']
    custom_settings = {
        'FEED_EXPORTERS': {
            "jsonlines": "scrapy.exporters.JsonLinesItemExporter",
        },
        'FEED_URI': '%(name)s_%(time)s.json',
        'FEED_FORMAT': 'jsonlines',
    }

    # One analyzer shared by every comment. Creating a new
    # NaiveBayesAnalyzer per response makes each instance train its own
    # classifier again, which dominates the crawl time.
    _analyzer = NaiveBayesAnalyzer()

    def parse(self, response):
        '''Follow every table-cell link whose href contains ``'comments'``.

        Cells without a link are skipped instead of raising KeyError.

        NOTE(review): an earlier revision also ran the row-handling code of
        ``parse_forum`` here, against the nested cells of each cell. It
        raised IndexError on any cell without nested cells and sent index
        links to ``parse_comment``, so it has been dropped - confirm that
        nothing relied on it.
        '''
        for each in response.css('table>tr>td'):
            linkfollow = each.css('a::attr(href)').get()
            if linkfollow and 'comments' in linkfollow:
                yield response.follow(linkfollow, callback=self.parse_forum)

    def parse_forum(self, response):
        '''Follow the link in the first cell of every data row of the listing table.

        Rows with fewer than two cells, or without a link, are skipped so
        that one malformed row cannot abort the rest of the page.
        '''
        rows = response.css('#contentwide>table>tr')
        # cut out the header row
        for each in rows[1:]:
            cols = each.xpath('.//td')
            if len(cols) < 2:
                continue
            linkfollow = cols[0].css('a::attr(href)').get()
            # TODO: title and commenter are raw node text and still need clean up
            comment_title = cols[0].xpath('a/text()').get()
            commenter = cols[1].xpath('text()').get()
            print(f'{comment_title} | {commenter}')
            if linkfollow:
                yield response.follow(linkfollow, callback=self.parse_comment)

    def parse_comment(self, response):
        '''Yield a ``CommentItem`` holding the comment text and its sentiment.

        The item carries ``comment``, ``sentiment`` (the classification
        label), ``sentiment_pos`` and ``sentiment_neg``. Pages on which the
        comment selector matches nothing are logged and skipped.
        '''
        text = response.css('.divComment>p::text').get()
        if text is None:
            self.logger.warning('No comment text found at %s', response.url)
            return
        # Replace non-breaking spaces with ordinary ones.
        text = text.replace(u'\u00a0', ' ')

        sentiment = TextBlob(text, analyzer=self._analyzer).sentiment

        entry = CommentItem()
        entry['comment'] = text
        entry['sentiment'] = sentiment.classification
        entry['sentiment_pos'] = sentiment.p_pos
        entry['sentiment_neg'] = sentiment.p_neg
        yield entry
|
||||
Reference in New Issue
Block a user