I am working on a script that takes a sample from each category in an excel file. The script work, but my results are not as expected--I get a sample of 2 all together. I want the script to take a 1, 3, or 5% from every category, unless there are a limited number of items in the category; in which case, I want a sample of 2. I have reproduced the code below--sorry for the big block of text, I just thought it would helpful to see the whole code. Any help to fix this issue will be greatly appreciated.
#imports
import pandas as pd
#read file
df = pd.read_excel(r"C:\Users\bryanmccormack\Desktop\Hasbro.xlsx")
#check for certain condition (Y)
df2 = df.loc[(df['Track Item']=='Y')]
print(len(df2))
#unique categories and subcategories
categories = df2['Category'].unique()
subcategories = df2['Subcategory'].unique()
#check for empty subcategories
subcategory = df2['Subcategory'].isnull().all()
#taking a sample based on whether subcategory is empty and the number of y-tracked items
if subcategory == True:
def sample_per(df2):
if len(df2) >= 1500:
for category in categories:
return df2.loc[(df2["Category"] == category)].apply(lambda x: x.sample(n=2) if
x.size*0.01 < 2 else x.sample(frac=0.01))
elif len(df2) < 15000 and len(df2) > 10000:
for category in categories:
return df2.loc[(df2["Category"] == category)].apply(lambda x: x.sample(n=2) if
x.size*0.03 < 2 else x.sample(frac=0.03))
else:
for category in categories:
return df2.loc[(df2["Category"] == category)].apply(lambda x: x.sample(n=2) if
x.size*0.05 < 2 else x.sample(frac=0.05))
else:
def sample_per(df2):
if len(df2) >= 1500:
for subcategory in subcategories:
return df2.loc[(df2["Subcategory"] == subcategory)].apply(lambda x: x.sample(n=2) if
x.size*0.01 < 2 else x.sample(frac=0.01))
elif len(df2) < 15000 and len(df2) > 10000:
for subcategory in categories:
return df2.loc[(df2["Subcategory"] == subcategory)].apply(lambda x: x.sample(n=2) if
x.size*0.03 < 2 else x.sample(frac=0.03))
else:
for subcategory in subcategories:
return df2.loc[(df2["Subcategory"] == subcategory)].apply(lambda x: x.sample(n=2) if
x.size*0.05 < 2 else x.sample(frac=0.05))
#result of sample_per function
final = sample_per(df2)
The spacing looks off because the lines are long--indentation is correct
Aucun commentaire:
Enregistrer un commentaire