hassaanik
/

Language_Identifier

Model card Files Files and versions Community

Language_Identifier / data_preparing.py

hassaanik's picture

Upload 9 files

c5c7499 verified over 1 year ago

history blame contribute delete

887 Bytes

	from sklearn.feature_extraction.text import CountVectorizer
	from data_analysis import df
	from sklearn.preprocessing import LabelEncoder
	from data_splitting import y_train, y_val
	import tensorflow as tf

	# --- Input features -------------------------------------------------------
	# Bag-of-words model: every row of df['clean_text'] becomes one row of token
	# counts. The vectorizer uses its default settings (ngram_range=(1, 2) was
	# left as a commented-out alternative in the earlier revision).
	cv = CountVectorizer()

	# Fit the vocabulary, build the count matrix, and downcast the counts to
	# uint8 in one step so the matrix takes less memory.
	# NOTE(review): a token occurring more than 255 times in a single document
	# would presumably not survive the uint8 cast intact -- confirm this cannot
	# happen for this dataset.
	features = cv.fit_transform(df['clean_text']).astype('uint8')


	# --- Target variable ------------------------------------------------------
	# The categorical 'language' column is mapped to integer placeholder codes
	# with a LabelEncoder. The fitted encoder stays available as `le`.
	le = LabelEncoder()
	_language_codes = le.fit_transform(df['language'])

	# Store the codes on the shared DataFrame (mutated in place) and expose the
	# resulting column as the target series.
	df['language_encoded'] = _language_codes
	targets = df['language_encoded']

	# --- One-hot targets for training / validation ----------------------------
	# Width of the one-hot vectors produced below.
	# NOTE(review): presumably the number of distinct languages in the dataset;
	# verify it matches len(le.classes_) if the data ever changes.
	_NUM_CLASSES = 22

	# Convert the label splits imported from data_splitting into one-hot
	# matrices with _NUM_CLASSES columns.
	y_train_encoded = tf.keras.utils.to_categorical(
		y_train,
		num_classes=_NUM_CLASSES,
	)
	y_val_encoded = tf.keras.utils.to_categorical(
		y_val,
		num_classes=_NUM_CLASSES,
	)