i have an if statement that checks whether if it is of type "list". if so, do not put it in a dictionary as a type list but put it raw since it already is a list. (saved to a dictionary)
for the other elements assign the element as it should (saved to a dictionary)
However, the if statement never seems to execute
please see the code section mentioned below,
print(type(contentAggregator[8]))
print(type(contentAggregator[9]))
print(type(contentAggregator[10]))
for i, k in enumerate(contentDict):
if type(contentAggregator[i]) == isinstance(contentAggregator[i],list):
contentDict[k] = contentAggregator[i]
else:
contentDict[k] = [contentAggregator[i]]
print(contentAggregator)
print(contentDict)
The code above does the check, i have verified the elements are of type 'list' but for some reason my if statment never seems to execute. real tricky one...
# This Python file uses the following encoding: utf-8
import pymysql
from sqlalchemy import create_engine
pymysql.install_as_MySQLdb()
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.by import By
from selenium import webdriver as wd
from Stamprally import StamprallyInfo
import re
import pandas as pd
import numpy as np
import urllib.request
import math
import time
import sys
import os
import MySQLdb
programStart = time.time()
prefectureNameList = ["北海道", "青森県", "岩手県", "宮城県", "秋田県", "山形県", "福島県", "茨城県", "栃木県", "群馬県", "埼玉県", "千葉県", "東京都", "神奈川県", "新潟県", "富山県", "石川県", "福井県", "山梨県", "長野県", "岐阜県", "静岡県",
"愛知県", "三重県", "滋賀県", "京都府", "大阪府", "兵庫県", "奈良県", "和歌山県", "鳥取県", "島根県", "岡山県", "広島県", "山口県", "徳島県", "香川県", "愛媛県", "高知県", "福岡県", "佐賀県", "長崎県", "熊本県", "大分県", "宮崎県", "鹿児島県", "沖縄県"]
data = []
timeStampData = []
contentAggregator = []
timeStampData.append("프로그램 시작")
timeStampData.append(programStart)
main_url = 'https://stamprally.org/'
programEnd = time.time()
timeStampData.append(programEnd - programStart)
timeStamp = pd.DataFrame(np.array([timeStampData]), columns=[
'설명', 'TimeStamp', '소요기간'])
timeStampData.clear()
timeStampData.append("셀레니엄 드라이버 로딩")
seleniumStart = time.time()
timeStampData.append(seleniumStart)
driver = wd.Chrome(executable_path='chromedriver.exe')
driver.get(main_url)
seleniumEnd = time.time()
timeStampData.append(seleniumEnd - seleniumStart)
rowAddTimeStampSelenium = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampSelenium, ignore_index=True)
timeStampData.clear()
prefectureValueStorage = [x.get_attribute('value') for x in driver.find_elements_by_xpath(
"//select[@name='search_cat1']/option[@class='level-1']")]
prefectureNameIterator = 0
for prefectureValue in prefectureValueStorage:
prefectureStart = time.time()
timeStampData.append(prefectureNameList[prefectureNameIterator])
timeStampData.append(prefectureStart)
driver.get(
f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0")
imageDownloadCounter = 1
totalList = driver.find_element_by_css_selector(
'div.page_navi2.clearfix>p').text
totalListNum = totalList.split("件中")
if int(totalListNum[0]) % 10 != 0:
pageLoopCount = math.ceil((int(totalListNum[0])/10))
else:
pageLoopCount = int(totalListNum[0])/10
currentpage = 0
while currentpage < pageLoopCount:
currentpage += 1
driver.get(
f"https://stamprally.org/?search_keywords&search_keywords_operator=and&search_cat1={prefectureValue}&search_cat2=0&paged={currentpage}")
urlList = []
currentUrlCounter = 0
listURLContainer = driver.find_elements_by_css_selector(
'#post_list2 > li > a')
for url in listURLContainer:
urlList.append(url.get_attribute('href'))
for listURL in listURLContainer:
contentAggregator = []
contentAggregator.append(int(totalListNum[0]))
contentAggregator.append(
prefectureNameList[prefectureNameIterator])
contentAggregator.append(
urlList[currentUrlCounter])
driver.get(urlList[currentUrlCounter])
currentUrlCounter += 1
locationTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
"//*[@id='post_meta_top']/li[1]/a[@class='cat-category']")]
contentAggregator.append(locationTag)
eventTag = [x.get_attribute('title') for x in driver.find_elements_by_xpath(
"//*[@id='post_meta_top']/li[2]/a[@class='cat-category2']")]
contentAggregator.append(eventTag)
availablePeriod = (driver.find_element_by_css_selector(
'div#post_date')).text.split("( ")
availablePeriodFormatted = availablePeriod[0].replace("開催期間:", "")
availableStartDate = availablePeriod[0].split(" ~ ")
endDate = availableStartDate[1]
availableStartDateFormatted = availableStartDate[0].replace(
"開催期間:", "")
lastUpdatedDate = driver.find_element_by_css_selector(
'time.entry-date.updated').text
contentAggregator.append(availablePeriodFormatted)
contentAggregator.append(availableStartDateFormatted)
contentAggregator.append(endDate)
contentAggregator.append(lastUpdatedDate[6:])
mainImageUrl = driver.find_element_by_css_selector(
'img.attachment-post-thumbnail.size-post-thumbnail.wp-post-image').get_attribute('src')
contentAggregator.append(mainImageUrl)
postTitle1 = driver.find_element_by_css_selector(
'h2#post_title').text.replace("開催終了", "")
postTitle = postTitle1.replace("ただいま開催中", "")
removeSpecialChars = postTitle.translate(
{ord(c): " " for c in "!@#$%^&*()[]{};:,./<>?\|`~-=_+"})
postTitle = removeSpecialChars
contentAggregator.append(postTitle)
eventValidity = driver.find_element_by_xpath(
"//*[@id='post_title']/span").text
contentAggregator.append(eventValidity)
urllib.request.urlretrieve(mainImageUrl, (str(
prefectureNameList[prefectureNameIterator])+postTitle+str(imageDownloadCounter) + ".png"))
imageDownloadCounter += 1
prefectureNameIterator += 1
innerWebSiteButtonURL = driver.find_element_by_css_selector(
'div.post_content.clearfix > div >a').get_attribute('href')
contentAggregator.append(innerWebSiteButtonURL)
mainText = driver.find_elements_by_css_selector(
'div.post_content.clearfix > p')
mainContentText = []
for mainContentDetail in mainText:
mainContentText.append(mainContentDetail.text)
mainContextTextCount = len(mainContentText)-1
contentAggregator.append(mainContentText[:mainContextTextCount])
contentReorder = [1, 0, 10, 11, 5, 6, 7, 8, 13, 3, 4, 9, 12, 2]
contentAggregator = [contentAggregator[i] for i in contentReorder]
data = data.append(contentAggregator)
df = pd.DataFrame(data, columns=["Prefecture", "Total List Number", "Title", "Event Validity", "Available Period", "Available StartDate",
"End Date", "Last Updated", "mainText", "Location Tag", "Event Tag", "Main Image URL", "innerWebSiteURL", "ListLink"])
contentDict = {
"Prefecture": "",
"Total List Number": "",
"Title": "",
"Event Validity": "",
"Available Period": "",
"Available StartDate": "",
"End Date": "",
"Last Updated": "",
"mainText": "",
"Location Tag": "",
"Event Tag": "",
"Main Image URL": "",
"innerWebSiteURL": "",
"ListLink": ""
}
print(type(contentAggregator[8]))
print(type(contentAggregator[9]))
print(type(contentAggregator[10]))
for i, k in enumerate(contentDict):
if type(contentAggregator[i]) == isinstance(contentAggregator[i],list):
contentDict[k] = contentAggregator[i]
else:
contentDict[k] = [contentAggregator[i]]
print(contentAggregator)
print(contentDict)
engine = create_engine("mysql+mysqldb://root:abcdefgH1@localhost/stamprallydb", encoding='utf-8')
df2 = pd.DataFrame(data=contentDict)
#df2=df.transpose()
conn = engine.connect()
df2.to_sql(name='stamprallydb_crawl_result',
con=engine, if_exists='append', index=True)
else:
prefectureEnd = time.time()
timeStampData.append(prefectureEnd-prefectureStart)
rowAddTimeStampPrefecture = pd.Series(
timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(
rowAddTimeStampPrefecture, ignore_index=True)
timeStampData.clear()
excelFileStart = time.time()
xlwriter = pd.ExcelWriter('StampRally_Crawler.xlsx')
df.to_excel(xlwriter, sheet_name="Stamprally.org Crawl Result")
excelFileEnd = time.time()
timeStampData.append("엑셀 파일 저장")
timeStampData.append(excelFileStart)
timeStampData.append(excelFileEnd-excelFileStart)
rowAddTimeStampPrefecture = pd.Series(timeStampData, index=timeStamp.columns)
timeStamp = timeStamp.append(rowAddTimeStampPrefecture, ignore_index=True)
timeStamp.to_excel(xlwriter, sheet_name="TimeStamp Result")
xlwriter.close()
driver.close()
driver.quit()
sys.exit()
Aucun commentaire:
Enregistrer un commentaire