lundi 14 septembre 2020

How to Reduce Python Code Complexity and runtime

The runtime of the code below is very huge. It takes 10 mins to process 6000 rows of data. Is there a way to reduce this huge runtime? My dataset has 200000 rows and it is not feasible to wait so long.

I tried executing it on Google Colab GPU and TPU as well. No luck. The runtime is still huge.

Please help me better understand alternatives to ifelse and the loops I used in this code and thereby reduce the runtime.

start_time= time.time()

folder= "April15"
curdir= source_dir+"//"+folder
files= sorted(os.listdir(curdir))

for file in files:
    print("extracting data from ", folder, file)
    if os.path.exists(dest_dir+r"//"+str(folder)+r"//"+file):
        print(file, ' exists')
        continue
        
    df = pd.read_csv(curdir +r"//"+ file, low_memory=False)
    n = len(df)
    print("jobs scrapped=", n)
    
    areas=[]
    titles = []
    companies = []
    ratings = []
    reviews = []
    experiences = []
    salaries = []
    locations = []
    descriptions = []
    naukri_categories = []
    posted_days = []
    tags = []

    df['footer']= df['footer'].replace('\nSave', '', regex=True)
    
    for i in range(n):
        
        head = df['header'][i].split('\n')
        if len(head)==5:
            title= head[0]
            company= head[1]
            rating= np.nan
            review= np.nan
            experience= head[2]
            salary= head[3]
            location= head[4]
        elif len(head)==7:
            title= head[0]
            company= head[1]
            rating= head[2]
            review= head[3]
            experience= head[4]
            salary= head[5]
            location= head[6]
        elif len(head)==6:
            title= head[0]
            company= head[1]
            rating= head[2]
            review= head[3]
            experience= np.nan
            salary= head[4]
            location= head[5]
        elif len(head)==4:
            title= head[0]
            company= head[1]
            rating= np.nan
            review= np.nan
            experience= np.nan
            salary= head[2]
            location= head[3]
        else:
            title= head[0]
            company= head[1]
            rating= np.nan
            review= np.nan
            experience= np.nan
            salary= head[2]
            location= np.nan
        
        job_descr = df['description'][i]

        foot= df['footer'].str.split('\n')
        if len(foot[i])==2:
            day= foot[i][1]
            category= foot[i][0]
        else:
            day= foot[i][0]
            category= np.nan

        areas.append(str(file[:-9]))
        titles.append(title)
        companies.append(company)
        ratings.append(rating)
        reviews.append(review)
        experiences.append(experience)
        salaries.append(salary)
        locations.append(location)
        descriptions.append(job_descr)
        posted_days.append(day)
        naukri_categories.append(category)

        if i%10000 == 0 and i != 0:
            print('\t', i, n)

    columns= ['functional area', 'title', 'company', 'experience','salary', 'location', 'description', 'naukri categories', 'posted_days', 'scraped_on', 'rating', 'review' ]
    df_new = pd.DataFrame(list(zip(areas, titles, companies, experiences, salaries, locations, descriptions, naukri_categories, posted_days, df['scraped_time'], ratings, reviews  )), columns =columns) 
    if len(tags)!=0:
        df_new['tags']= tags
    print('Extraction completed, writing to destination ', file)
    df_new.to_csv(dest_dir+r"//"+str(folder)+r"//"+file, index=False)
    
    print("time_taken for file=", (time.time()-start_time)/60, " minutes")
print("time_taken for folder=", (time.time()-start_time)/60, " minutes")strong text

Aucun commentaire:

Enregistrer un commentaire