The runtime of the code below is very huge. It takes 10 mins to process 6000 rows of data. Is there a way to reduce this huge runtime? My dataset has 200000 rows and it is not feasible to wait so long.
I tried executing it on Google Colab GPU and TPU as well. No luck. The runtime is still huge.
Please help me better understand alternatives to ifelse and the loops I used in this code and thereby reduce the runtime.
start_time= time.time()
folder= "April15"
curdir= source_dir+"//"+folder
files= sorted(os.listdir(curdir))
for file in files:
print("extracting data from ", folder, file)
if os.path.exists(dest_dir+r"//"+str(folder)+r"//"+file):
print(file, ' exists')
continue
df = pd.read_csv(curdir +r"//"+ file, low_memory=False)
n = len(df)
print("jobs scrapped=", n)
areas=[]
titles = []
companies = []
ratings = []
reviews = []
experiences = []
salaries = []
locations = []
descriptions = []
naukri_categories = []
posted_days = []
tags = []
df['footer']= df['footer'].replace('\nSave', '', regex=True)
for i in range(n):
head = df['header'][i].split('\n')
if len(head)==5:
title= head[0]
company= head[1]
rating= np.nan
review= np.nan
experience= head[2]
salary= head[3]
location= head[4]
elif len(head)==7:
title= head[0]
company= head[1]
rating= head[2]
review= head[3]
experience= head[4]
salary= head[5]
location= head[6]
elif len(head)==6:
title= head[0]
company= head[1]
rating= head[2]
review= head[3]
experience= np.nan
salary= head[4]
location= head[5]
elif len(head)==4:
title= head[0]
company= head[1]
rating= np.nan
review= np.nan
experience= np.nan
salary= head[2]
location= head[3]
else:
title= head[0]
company= head[1]
rating= np.nan
review= np.nan
experience= np.nan
salary= head[2]
location= np.nan
job_descr = df['description'][i]
foot= df['footer'].str.split('\n')
if len(foot[i])==2:
day= foot[i][1]
category= foot[i][0]
else:
day= foot[i][0]
category= np.nan
areas.append(str(file[:-9]))
titles.append(title)
companies.append(company)
ratings.append(rating)
reviews.append(review)
experiences.append(experience)
salaries.append(salary)
locations.append(location)
descriptions.append(job_descr)
posted_days.append(day)
naukri_categories.append(category)
if i%10000 == 0 and i != 0:
print('\t', i, n)
columns= ['functional area', 'title', 'company', 'experience','salary', 'location', 'description', 'naukri categories', 'posted_days', 'scraped_on', 'rating', 'review' ]
df_new = pd.DataFrame(list(zip(areas, titles, companies, experiences, salaries, locations, descriptions, naukri_categories, posted_days, df['scraped_time'], ratings, reviews )), columns =columns)
if len(tags)!=0:
df_new['tags']= tags
print('Extraction completed, writing to destination ', file)
df_new.to_csv(dest_dir+r"//"+str(folder)+r"//"+file, index=False)
print("time_taken for file=", (time.time()-start_time)/60, " minutes")
print("time_taken for folder=", (time.time()-start_time)/60, " minutes")strong text
Aucun commentaire:
Enregistrer un commentaire