jeudi 28 juillet 2016

Code not executing after if statement inside of a for loop

So. I have a web scraper that will download pdf sources based on the name of the "a" tag given. If there is a match in the link title. I want to download the pdf source and save it to the directory specified... However, not ALL of the links get downloaded... It will print the "File Type", then skip my entire for loop and move on to the next link... HELP!

numb = 0
# Go through each link found in the "Virtual Web Folder"
for item in linkList:

    print(item)
    # Open up the link, locate and extract the block and lot numbers, as
    # well as the type of report found in the webpage
    html = urlopen(item)
    time.sleep(2)
    soup = BeautifulSoup(html.read(), "html5lib")
    mainInfo = soup.find_all("td",{"class":"maininfo"})
    try:
        reportType = mainInfo[0].text
    except:
        pass
    reportType = reportType.strip()
    try:
        location = mainInfo[4].text.split()
    except:
        pass
    binBlock = location[3]+"-"+location[5]


    aName = reportType.upper()


    print("Determining Report Type...")
    time.sleep(1)
    # "Soil Report titles can be upper or lower case.. So to make sure they all
    # match. Force the upper case and look for a match
    if reportType.upper() == "SOIL REPORT" or reportType == "TECHNICAL REPORT: SOIL INSPECTION":

        if aName == "SOIL REPORT":
            name = "SR"
            print("File Type: " + name)
            pass

        else:
            name = "TR"
            print("File Type: " + name)
            pass

        pass

        # If a match is found. Find the source in the html code that contains
        # the pdf
        for link in soup.select("iframe[src]"):

            print("Downloading...")
            time.sleep(1)
            #Extrace the source and make the link useable
            extractLink = startUrl+link["src"]
            #Check if path is valid.. If not, create a new folder
            firstPath = "/Volumes/Westergaard/BRONX/"+binBlock+"/"
            if not os.path.exists(firstPath):
                os.makedirs(firstPath)

            path = firstPath + name + str(numb) + ".pdf"
            #Download the file and save to correct path
            pdfFile = urllib.request.urlretrieve(extractLink, path)
            time.sleep(2)
            print("Finished"+"\n")
            numb = numb + 1
            continue
        else:

        binBlock = str(accum)
        name = "NA"

        for link in soup.select("iframe[src]"):

            print("Downloading Uknown...")
            time.sleep(1)
            extractLink = startUrl+link["src"]
            #Now... download this link
            firstPath = "/Volumes/Westergaard/BRONX/"
            path = firstPath + binBlock + name + "(1).pdf"
            pdfFile = urllib.request.urlretrieve(extractLink, path)
            time.sleep(2)
            print("Finished"+"\n")
            continue

Aucun commentaire:

Enregistrer un commentaire