# Language_Identifier / data_preparing.py
# hassaanik's picture
# Upload 9 files
# c5c7499 verified
from sklearn.feature_extraction.text import CountVectorizer
from data_analysis import df
from sklearn.preprocessing import LabelEncoder
from data_splitting import y_train, y_val
import tensorflow as tf
# Input variable
# Turn the 'clean_text' column into a sparse token-count matrix.
cv = CountVectorizer()  # noted alternative: ngram_range=(1, 2)
# Fit the vocabulary and vectorize in one pass, then cast the counts to
# uint8 so the matrix takes less memory (float32 is the noted alternative).
# NOTE(review): uint8 only holds 0-255 -- confirm no token occurs more than
# 255 times within a single text.
features = cv.fit_transform(df['clean_text']).astype('uint8')
# Target variable
# LabelEncoder maps each value of the categorical 'language' column to an
# integer id; the fitted encoder stays available as `le`.
le = LabelEncoder()
_language_codes = le.fit_transform(df['language'])
# Keep the integer ids on the DataFrame and expose that column as the targets.
df['language_encoded'] = _language_codes
targets = df['language_encoded']
# One-hot encode the training and validation labels.
# The vector width is taken from the fitted encoder instead of repeating a
# literal (the original hard-coded 22 in both calls), so it always equals the
# number of distinct languages seen in df['language'].
# assumes y_train / y_val hold the integer ids produced by `le` -- TODO confirm
# against data_splitting.
num_classes = len(le.classes_)
y_train_encoded = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_val_encoded = tf.keras.utils.to_categorical(y_val, num_classes=num_classes)