Spider fetches ViewComments.cfm?GdocForumID=N with vPerPage=500, generates all page requests from page-1 metadata, and parses each div.Cbox for comment_id, author, date, title, text, reg_title, reg_desc. Handles span-wrapped comment text. Fixes UTF-8/windows-1251 meta-tag encoding mismatch. 9083 items, 15 empty-text (0.17%). Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
18 lines
384 B
Python
18 lines
384 B
Python
import scrapy
|
|
|
|
|
|
class CommentItem(scrapy.Item):
    """Container for a single scraped comment and its forum context.

    Every attribute is declared as a plain ``scrapy.Field()`` with no
    metadata, so no serializer or processor is attached at the item level;
    whatever the spider assigns is stored as-is.

    NOTE(review): the per-field descriptions below are inferred from the
    field names and should be confirmed against the spider that populates
    this item.
    """

    # Forum / regulation context
    forum_id = scrapy.Field()   # presumably the forum identifier the comment belongs to
    reg_title = scrapy.Field()  # presumably the title of the regulation under discussion
    reg_desc = scrapy.Field()   # presumably the regulation's descriptive text

    # Comment metadata
    comment_id = scrapy.Field()  # presumably the site's identifier for this comment
    author = scrapy.Field()      # presumably the commenter's displayed name
    date = scrapy.Field()        # presumably the posting date; format not shown here
    title = scrapy.Field()       # presumably the comment's subject line

    # Comment content
    text = scrapy.Field()  # comment body; may be empty for some comments -- TODO confirm