|
import pandas as pd |
|
import getString |
|
import getNLP |
|
import os |
|
|
|
def one_hot_encode_objects(df, nlp, columns):
    """One-hot encode the eligible object-dtype columns of *df*.

    An object column is encoded unless its name is in ``nlp``, is in
    ``columns``, or contains the substring ``"label"``. Columns holding
    container values (list, tuple, dict, set, or anything exposing
    ``__array__``) are skipped with a printed notice.

    Args:
        df: Input DataFrame. It is not modified; a new frame is returned
            whenever at least one column gets encoded.
        nlp: Container of column names to leave untouched.
        columns: Container of additional column names to leave untouched.

    Returns:
        A DataFrame in which every encoded column is replaced by integer
        0/1 indicator columns named ``<col>_<value>``. The indicator columns
        are appended after the remaining original columns, in the order the
        source columns were encountered.
    """
    encoded_cols = []
    dummy_frames = []

    for col in df.select_dtypes(include='object').columns:
        if col in nlp or col in columns or "label" in col:
            continue

        # Container-like cell values are skipped rather than encoded.
        holds_container = df[col].apply(
            lambda x: isinstance(x, (list, tuple, dict, set)) or hasattr(x, '__array__')
        )
        if holds_container.any():
            print(f"Skipping column '{col}' due to unhashable values.")
            continue

        dummy_frames.append(pd.get_dummies(df[col], prefix=col).astype(int))
        encoded_cols.append(col)

    if encoded_cols:
        # Concatenate once instead of once per column: repeated concat/drop
        # inside the loop re-copies the whole frame on every iteration.
        df = pd.concat([df.drop(columns=encoded_cols), *dummy_frames], axis=1)

    print(df.columns)
    return df
|
|
|
def fixEmpty(df):
    """Normalise placeholder strings to missing values and fill the gaps.

    The strings ``'undefined'``, ``'null'``, ``'NaN'`` and ``'None'`` are
    replaced with ``pd.NA``. Missing values in text columns (object or pandas
    string dtype) are then filled with ``'Unknown'``; every other column is
    filled with its own mean.

    Args:
        df: DataFrame to clean. It is modified in place.

    Returns:
        The same DataFrame object, after cleaning.
    """
    df.replace(['undefined', 'null', 'NaN', 'None'], pd.NA, inplace=True)

    for col in df.columns:
        dtype = df[col].dtype
        # Comparing the dtype to 'object' alone misses pandas string-dtype
        # columns, which would then reach .mean() and raise TypeError.
        is_text = (
            pd.api.types.is_object_dtype(dtype)
            or pd.api.types.is_string_dtype(dtype)
        )
        if is_text:
            df[col] = df[col].fillna('Unknown')
        else:
            df[col] = df[col].fillna(df[col].mean())

    return df
|
|
|
def preprocessing(query):
    """Run the preprocessing pipeline for one query and write the result.

    Steps, in order:
      1. Ensure the ``./processed`` directory exists.
      2. Read ``final/<query>.csv``.
      3. Fill missing values with ``fixEmpty``.
      4. Call ``getString.getCodes(query)``, unpacked as
         ``(preDict, col, nlp)``. When ``col`` is non-empty, each
         ``preDict`` entry maps a new column name to a Python expression
         string that is evaluated and stored, and the columns listed in
         ``col`` are then dropped.
      5. When ``nlp`` is non-empty, pass the frame through
         ``getNLP.wordEmbed(df, nlp)``.
      6. One-hot encode the remaining eligible object columns.
      7. Write ``./processed/<query>.csv`` without the index.

    Args:
        query: Base file name (no extension) used for both the input and
            the output CSV.

    Returns:
        None. The result is written to disk.
    """
    os.makedirs("./processed", exist_ok=True)
    df = pd.read_csv("final/" + query + ".csv")

    df = fixEmpty(df)
    preDict, col, nlp = getString.getCodes(query)
    if len(col) > 0:
        for new_col, expr in preDict.items():
            # SECURITY: eval() executes arbitrary code. This is only safe if
            # the expressions returned by getString.getCodes are trusted.
            # NOTE(review): the expressions presumably reference locals of
            # this function (e.g. `df`), so local names must not be renamed
            # -- confirm against getString.
            df[new_col] = eval(expr)
        # Drop the listed columns only after every expression has run.
        df.drop(columns=col, inplace=True)
    if len(nlp) > 0:
        df = getNLP.wordEmbed(df, nlp)

    df = one_hot_encode_objects(df, nlp, col)

    df.to_csv("./processed/" + query + ".csv", index=False)
|
|
|
|
|
|
|
|