So. I have a web scraper that will download pdf sources based on the name of the "a" tag given. If there is a match in the link title. I want to download the pdf source and save it to the directory specified... However, not ALL of the links get downloaded... It will print the "File Type", then skip my entire for loop and move on to the next link... HELP!
numb = 0
# Go through each link found in the "Virtual Web Folder"
for item in linkList:
print(item)
# Open up the link, locate and extract the block and lot numbers, as
# well as the type of report found in the webpage
html = urlopen(item)
time.sleep(2)
soup = BeautifulSoup(html.read(), "html5lib")
mainInfo = soup.find_all("td",{"class":"maininfo"})
try:
reportType = mainInfo[0].text
except:
pass
reportType = reportType.strip()
try:
location = mainInfo[4].text.split()
except:
pass
binBlock = location[3]+"-"+location[5]
aName = reportType.upper()
print("Determining Report Type...")
time.sleep(1)
# "Soil Report titles can be upper or lower case.. So to make sure they all
# match. Force the upper case and look for a match
if reportType.upper() == "SOIL REPORT" or reportType == "TECHNICAL REPORT: SOIL INSPECTION":
if aName == "SOIL REPORT":
name = "SR"
print("File Type: " + name)
pass
else:
name = "TR"
print("File Type: " + name)
pass
pass
# If a match is found. Find the source in the html code that contains
# the pdf
for link in soup.select("iframe[src]"):
print("Downloading...")
time.sleep(1)
#Extrace the source and make the link useable
extractLink = startUrl+link["src"]
#Check if path is valid.. If not, create a new folder
firstPath = "/Volumes/Westergaard/BRONX/"+binBlock+"/"
if not os.path.exists(firstPath):
os.makedirs(firstPath)
path = firstPath + name + str(numb) + ".pdf"
#Download the file and save to correct path
pdfFile = urllib.request.urlretrieve(extractLink, path)
time.sleep(2)
print("Finished"+"\n")
numb = numb + 1
continue
else:
binBlock = str(accum)
name = "NA"
for link in soup.select("iframe[src]"):
print("Downloading Uknown...")
time.sleep(1)
extractLink = startUrl+link["src"]
#Now... download this link
firstPath = "/Volumes/Westergaard/BRONX/"
path = firstPath + binBlock + name + "(1).pdf"
pdfFile = urllib.request.urlretrieve(extractLink, path)
time.sleep(2)
print("Finished"+"\n")
continue
Aucun commentaire:
Enregistrer un commentaire