I am currently working on a Web Scraping project to scrape Data from a Newsletter Forum. For this i need to show if the comment was written by a staff member or by a reader/ customer. If the comment was written by a staff member i want to write "Admin", if not "Customer".
The comment was written bei a staff member if the span
element contains username--staff
. I already tried to use a if-loop in the items file but this did not work.
As you may have noticed i am pretty new to this stuff, so please forgive me if this was an incredible dumb question or if I did something wrong. I would be really thankful if someone could help me. :)
Here is my code:
Spider:
class ComputerBaseSpider(scrapy.Spider):
name = "ComputerBase"
start_urls = [
'https://www.computerbase.de/forum/threads/in-eigener-sache-auch-wir-kommen-an-einem-consent-dialog-nicht-vorbei.1973328/',
]
def parse(self, response):
comments = response.xpath( '//*/div[@class = "message-cell message-cell--main"]')
for comment in comments:
loader = ItemLoader(item = CBCrawlerItem(), selector = comment)
loader.add_xpath('Comment_no','.//ul[@class="message-attribution-opposite message-attribution-opposite--list "]/li[2]/a/text()')
loader.add_xpath('Datetime','.//time[@class="u-dt"]/@datetime')
loader.add_xpath('Comment', './/*/article[@class = "message-body js-selectToQuote"]//div[@class = "bbWrapper"]//text()[not(ancestor::*[@class="js-extraPhrases"])]')
loader.add_xpath('Admin', '..//span[contains(@class, "username--staff")]')
yield loader.load_item()
next_page = response.xpath('//*/a[@class="pageNav-jump pageNav-jump--next"]/@href').extract_first()
if next_page is not None:
yield response.follow(next_page, callback=self.parse)
items file:
def clean_data(data):
data = data.strip()
return data
def remove_quotes(text):
#strip the unicode quotes
text = text.strip(u'\u201c'u'\u201d')
return text
def user_check(user):
if response.xpath( '//*/div[@class = "message-cell message-cell--main"]..//span[contains(@class, "username--staff")]') is true:
return "Admin"
else:
return "Customer"
class CBCrawlerItem(scrapy.Item):
# define the fields for your item here like:
Comment_no = scrapy.Field(
input_processor=MapCompose(clean_data),
output_processor=TakeFirst()
)
Datetime = scrapy.Field(
input_processor=MapCompose(),
output_processor=TakeFirst()
)
Comment = scrapy.Field(
input_processor=MapCompose(clean_data, remove_quotes),
output_processor=Identity()
)
Admin = scrapy.Field(
input_processor=MapCompose(user_check),
output_processor=TakeFirst()
)
pass
This was my try:
loader.add_xpath('Admin', '..//span[contains(@class, "username--staff")]')
def user_check(user):
if response.xpath( '//*/div[@class = "message-cell message-cell--main"]..//span[contains(@class, "username--staff")]') is true:
return "Admin"
else:
return "Customer"
Aucun commentaire:
Enregistrer un commentaire