from sklearn.feature_extraction.text import CountVectorizer
from data_analysis import df
from sklearn.preprocessing import LabelEncoder
from data_splitting import y_train, y_val
import tensorflow as tf

#Input Variable
# vectorizing input varible 'clean_text' into a matrix 
features = df['clean_text']

cv = CountVectorizer() # ngram_range=(1,2)
features = cv.fit_transform(features)

# changing the datatype of the number into uint8 to consume less memory
features = features.astype('uint8') # uint8 and float32


# defining target variable
# using LabelEncoder to get placeholder number values for categorical variabel 'language'
le = LabelEncoder()
df['language_encoded'] = le.fit_transform(df['language'])

targets = df['language_encoded']

y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=22)
y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes=22)