Dataset / preprocessing /process.py
vansh9878's picture
files added
825e978
raw
history blame contribute delete
1.63 kB
import pandas as pd
import getString
import getNLP
import os
def one_hot_encode_objects(df, nlp, columns):
    """One-hot encode the object-dtype columns of *df*.

    Columns listed in *nlp* or *columns*, columns whose name contains the
    substring "label", and columns holding unhashable values (lists, dicts,
    sets, tuples, arrays) are left untouched.  Returns a new DataFrame with
    each encoded source column replaced by its integer dummy columns.
    """
    def _holds_unhashable(value):
        # get_dummies cannot group these; treat array-likes the same way.
        return isinstance(value, (list, tuple, dict, set)) or hasattr(value, '__array__')

    for col in df.select_dtypes(include='object').columns:
        # Skip columns reserved for NLP embedding, derived-feature sources,
        # and anything that looks like a label column.
        if col in nlp or col in columns or "label" in col:
            continue
        if df[col].apply(_holds_unhashable).any():
            print(f"Skipping column '{col}' due to unhashable values.")
            continue
        encoded = pd.get_dummies(df[col], prefix=col).astype(int)
        df = pd.concat([df.drop(columns=[col]), encoded], axis=1)
    print(df.columns)
    return df
def fixEmpty(df):
    """Normalize missing-value markers and impute them.

    The placeholder strings 'undefined', 'null', 'NaN' and 'None' are first
    converted to real NA values (in place).  Object columns are then filled
    with the string 'Unknown'; every other column is filled with its mean.
    Returns the same DataFrame for chaining.
    """
    df.replace(['undefined', 'null', 'NaN', 'None'], pd.NA, inplace=True)
    for name in df.columns:
        series = df[name]
        fill_value = 'Unknown' if series.dtype == 'object' else series.mean()
        df[name] = series.fillna(fill_value)
    return df
def preprocessing(query):
    """Run the full preprocessing pipeline for dataset *query*.

    Reads ``final/<query>.csv``, cleans missing values, applies generated
    feature expressions, embeds NLP columns, one-hot encodes the rest, and
    writes the result to ``./processed/<query>.csv``.
    """
    os.makedirs("./processed",exist_ok=True)
    df=pd.read_csv("final/"+query+".csv")
    # print(df.head())
    # Normalize placeholder NA markers and impute missing values.
    df=fixEmpty(df)
    # NOTE(review): assumes getString.getCodes returns (dict of
    # new-column -> expression string, list of source columns to drop,
    # list of NLP column names) — confirm against getString module.
    preDict,col,nlp=getString.getCodes(query)
    if len(col)>0:
        for new_col, expr in preDict.items():
            # HACK: eval on generated expression strings — the expressions
            # presumably reference the local name `df`. Safe only if
            # getString output is fully trusted; never feed it user input.
            df[new_col] = eval(expr)
        # Drop the raw columns the derived features were built from.
        df.drop(columns=col, inplace=True)
    if len(nlp)>0:
        # Replace NLP text columns with word embeddings.
        df=getNLP.wordEmbed(df,nlp)
    # print(df.columns)
    # One-hot encode remaining object columns, skipping nlp/derived sources.
    df=one_hot_encode_objects(df,nlp,col)
    # df = df.astype('float32')
    df.to_csv("./processed/"+query+".csv", index=False)
    # print(df.head())
    # print(df.info())
# preprocessing("twitter sentiment analysis")