hassaanik committed
Commit
24bf069
1 Parent(s): fc81358

Upload 8 files

Files changed (8)
  1. app.py +24 -0
  2. data_analysis.py +15 -0
  3. data_cleaning.py +61 -0
  4. data_preparing.py +0 -0
  5. data_splitting.py +0 -0
  6. model.py +0 -0
  7. model_callbacks.py +13 -0
  8. predict.py +10 -0
app.py ADDED
@@ -0,0 +1,24 @@
+ from flask import Flask, request, render_template, jsonify
+ from predict import predict_language
+ import joblib
+ import tensorflow as tf
+ import h5py
+
+ model = tf.keras.models.load_model('models\\full_language_identifcation_modelf.h5')
+ model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
+ count_vectorizer = joblib.load('models\\cv.joblib')  # fitted CountVectorizer
+ label_encoder = joblib.load('models\\le.joblib')     # fitted LabelEncoder
+
+
+ app = Flask(__name__)
+
+ @app.route('/', methods=['GET', 'POST'])
+ def predict():
+     if request.method == 'POST':
+         text = request.form['text']
+         prediction = predict_language(text, model, count_vectorizer, label_encoder)  # call the shared prediction helper
+         return render_template('result.html', prediction=prediction, text=text)
+     return render_template('index.html')
+
+ if __name__ == '__main__':
+     app.run(debug=True)
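As a quick sanity check, the `/` route can be exercised from a separate script. This is a minimal sketch, assuming the app is running locally on Flask's default port (5000) and that the `requests` package is installed; the form field name `text` matches `request.form['text']` above.

import requests

# Post a sample sentence to the form endpoint
resp = requests.post('http://127.0.0.1:5000/', data={'text': 'Bonjour tout le monde'})
print(resp.status_code)  # 200 if result.html rendered
print(resp.text[:200])   # start of the rendered page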
data_analysis.py ADDED
@@ -0,0 +1,15 @@
+ import pandas as pd
+
+ df = pd.read_csv('data\\dataset.csv')
+
+ # df.head()
+
+ # df.info()
+
+ # df.isnull().sum()
+
+ # df.language.value_counts()
+
+ # df.text[0]
+
+ # df.language[0]
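The commented-out calls above sketch a typical exploratory pass. A runnable version of the same checks, assuming the `data\\dataset.csv` path and `language` column from the diff:

import pandas as pd

df = pd.read_csv('data\\dataset.csv')

print(df.head())                   # preview the first rows
df.info()                          # dtypes and non-null counts
print(df.isnull().sum())           # missing values per column
print(df.language.value_counts())  # class balance across languages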
data_cleaning.py ADDED
@@ -0,0 +1,61 @@
+ from data_analysis import df
+ from nltk.tokenize import word_tokenize
+ import re
+ import pandas as pd
+ import nltk
+
+ # Removing duplicates
+ # df = df.drop_duplicates(subset='Text')
+ # df = df.reset_index(drop=True)
+
+ nltk.download('punkt')
+ # Characters to drop after tokenization (punctuation and other non-alphanumeric tokens)
+ nonalphanumeric = ['\'', '.', ',', '\"', ':', ';', '!', '@', '#', '$', '%', '^', '&',
+                    '*', '(', ')', '-', '_', '+', '=', '[', ']', '{', '}', '\\', '?',
+                    '/', '>', '<', '|', ' ']
+
+ def clean_text(text):
+     """
+     Clean and preprocess text data.
+     """
+     # Tokenize the text with NLTK
+     tokens = word_tokenize(text)
+
+     # Drop punctuation tokens and lowercase the rest
+     words = [word.lower() for word in tokens if word not in nonalphanumeric]
+
+     # Join the cleaned words back into a single string
+     cleaned_text = " ".join(words)
+
+     return cleaned_text
+
+ def remove_english(text):
+     """
+     Return the text with English (Latin-alphabet) words removed.
+     """
+     pat = "[a-zA-Z]+"
+     text = re.sub(pat, "", text)
+     return text
+
+
+ # Applying clean_text to every row of the 'Text' column
+ # df['clean_text'] = df['Text'].apply(clean_text)
+
+
+
+ # # Removing English from Chinese text
+ # df_Chinese = df[df['language']=='Chinese']  # Chinese rows in the dataset
+
+ # chinese_text = df.loc[df.language=='Chinese']['clean_text']
+ # chinese_text = chinese_text.apply(remove_english)  # removing English words
+ # df_Chinese.loc[:,'clean_text'] = chinese_text
+
+ # # Drop the original (uncleaned) Chinese rows first, so the concat below does not duplicate them
+ # df = df[~df['language'].isin(['Chinese'])].reset_index(drop=True)
+
+ # # Concatenate the cleaned Chinese rows back onto the DataFrame
+ # df = pd.concat([df, df_Chinese], axis=0, ignore_index=True)
+
+
+ # # Shuffling the dataframe and resetting the index
+ # df = df.sample(frac=1).reset_index(drop=True)
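A small usage sketch of the two helpers above; the expected outputs assume NLTK's default `punkt` tokenizer.

from data_cleaning import clean_text, remove_english

print(clean_text("Hello, world!"))      # "hello world": punctuation tokens dropped, words lowercased
print(remove_english("中文 text 混合"))  # "中文  混合": Latin-alphabet words stripped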
data_preparing.py ADDED
File without changes
data_splitting.py ADDED
File without changes
model.py ADDED
File without changes
model_callbacks.py ADDED
@@ -0,0 +1,13 @@
+ import tensorflow as tf
+
+
+ def lr_scheduler(epoch, lr):
+     if epoch < 3:
+         return lr
+     else:
+         return lr * tf.math.exp(-0.1)
+
+
+ early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)
+ lr_scheduler_callback = tf.keras.callbacks.LearningRateScheduler(lr_scheduler)
+ optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
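These objects are presumably consumed at training time (the training script is not part of this commit). A minimal sketch of how they would be wired in, assuming a compiled Keras `model` and `x_train`/`y_train` arrays (placeholder names, not from this commit):

from model_callbacks import early_stopping, lr_scheduler_callback, optimizer

model.compile(optimizer=optimizer,
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])
history = model.fit(x_train, y_train,
                    validation_split=0.2,  # hold out data so val_loss exists for EarlyStopping
                    epochs=20,
                    callbacks=[early_stopping, lr_scheduler_callback])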
predict.py ADDED
@@ -0,0 +1,10 @@
+ from data_cleaning import clean_text
+ import numpy as np
+
+
+ def predict_language(text, model, cv, le):
+     cleaned_text = clean_text(text)
+     text_vectorized = cv.transform([cleaned_text])
+     prediction = model.predict(text_vectorized)
+     predicted_label = le.inverse_transform([np.argmax(prediction)])[0]  # inverse_transform returns an array; take its single element
+     return predicted_label
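End to end, the helper is invoked the same way app.py does it. A minimal sketch, assuming the artifacts under `models` referenced in app.py are present:

import joblib
import tensorflow as tf
from predict import predict_language

model = tf.keras.models.load_model('models\\full_language_identifcation_modelf.h5')
cv = joblib.load('models\\cv.joblib')
le = joblib.load('models\\le.joblib')

print(predict_language('Bonjour tout le monde', model, cv, le))  # e.g. 'French', depending on the trained label set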