lundi 20 janvier 2020

beautifulsoup for-if loop extract

i want to extract data from below website using for/if loops. Below code successfully extracts data from articles with for/if loop, but I want to update it and extract company, satisfied % and overall rating data (which is always the same) also using loop.

overall=[]

satisfied=[]
company=[]

arbeitsatmosphare = []
vorgesetztenverhalten = []
kollegenzusammenhalt= []



lurl='https://www.kununu.com/de/volkswagenconsulting/kommentare'
with requests.Session() as session:
    session.headers = {
        'x-requested-with': 'XMLHttpRequest'
    }
    page = 1
    while True:
        print(f"Processing page {page}..")
        url = f'{lurl}/{page}'
        print(url)
        response = session.get(url)
        soup = BeautifulSoup(response.text, 'html.parser')

        articles = soup.find_all('article')
        print("Number of articles: " + str(len(articles)))
        for article in articles:

            for key in [{'label': 'Arbeitsatmosphäre', 'list': arbeitsatmosphare},
                        {'label': 'Vorgesetztenverhalten', 'list': vorgesetztenverhalten},
                        {'label': 'Kollegenzusammenhalt', 'list': kollegenzusammenhalt}]:
                span = article.find('span', text=re.compile(key['label']))
                #print(span)
                if span and span.find_next('span'):
                    key['list'].append(span.find_next('span').text.strip())
                else:
                    key['list'].append('N/A')



# THIS PART IS NOT WORKING

            div = soup.find(class_="company-profile-container")
            for key2 in [{'label2': 'company-name', 'list': company},
                             {'label2': 'review-recommend-value', 'list': satisfied},
                            {'label2': 'review-rating-value', 'list': overall}]:
                span2 = div.find('span', text=re.compile(key2['label2']))
                #print(span2)
                if span2 and span2.find('span'):
                    key2['list'].append(span2.find('span').text.strip())
                else:
                    key2['list'].append('N/A')
        page += 1
        pagination = soup.find_all('div', {'class': 'paginationControl'})
        if not pagination:
            break

    #print(overall)
    df = pd.DataFrame({'Arbeitsatmosphäre': arbeitsatmosphare,
                       'Vorgesetztenverhalten': vorgesetztenverhalten,
                       'Kollegenzusammenhalt': kollegenzusammenhalt,
                       'company': company,
                       'satisfied': satisfied,
                       'overall':overall
                       })

print(df)

I used above code as example, but looks like my part is not working. I cant find the issue, can you help?

Aucun commentaire:

Enregistrer un commentaire