yassiracharki committed on
Commit
d982179
1 Parent(s): fdede4d

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +103 -3
README.md CHANGED
@@ -1,3 +1,103 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ metrics:
4
+ - accuracy
5
+ pipeline_tag: text-classification
6
+ tags:
7
+ - CNN
8
+ - NLP
9
+ - Yelp
10
+ - Reviews
11
+ - pre_trained
12
+ ---
13
+ # Model Card for Model ID
14
+
15
+ # Downloads
16
+ !pip install contractions
17
+ !pip install textsearch
18
+ !pip install tqdm
19
+
20
+ import nltk
21
+ nltk.download('punkt')
22
+
23
+ # Fundamental classes
24
+ import tensorflow as tf
25
+ from tensorflow import keras
26
+ import pandas as pd
27
+ import numpy as np
28
+
29
+ # Time
30
+ import time
31
+ import datetime
32
+
33
+ # Preprocessing
34
+ from tensorflow.keras.preprocessing.text import Tokenizer
35
+ from tensorflow.keras.preprocessing import sequence
36
+ from sklearn.preprocessing import LabelEncoder
37
+ import contractions
38
+ from bs4 import BeautifulSoup
39
+ import re
40
+ import tqdm
41
+ import unicodedata
42
+
43
+ seed = 3541
44
+ np.random.seed(seed)
45
+
46
+ # Define a dummy loss to work around an error during model loading (note: it is not referenced in the code shown below, which loads the model with compile=False)
47
+ def dummy_loss(y_true, y_pred):
48
+ return tf.reduce_mean(y_pred - y_true)
49
+
50
+ # Loading the model trained on Yelp reviews
51
+ modelYelp = keras.models.load_model(
52
+ '/kaggle/input/pre-trained-model-binary-cnn-nlp-yelpreviews/tensorflow1/pre-trained-model-binary-cnn-nlp-yelp-reviews/1/Binary_Classification_90_Yelp_Reviews_CNN.h5',
53
+ compile=False
54
+ )
55
+
56
+ # Compile the model with the correct loss function and reduction
57
+ modelYelp.compile(
58
+ optimizer='adam',
59
+ loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
60
+ metrics=['accuracy']
61
+ )
62
+
63
+ # Loading Yelp test data
64
+ dataset_test_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/test.csv')
65
+
66
+ # Loading Yelp train data (only used for the label encoder)
67
+ dataset_train_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/train.csv')
68
+
69
+ # Shuffling the test and train data
70
+ test_Yelp = dataset_test_Yelp.sample(frac=1)
71
+ train_Yelp = dataset_train_Yelp.sample(frac=1)
72
+
73
+ # Taking a small portion of the training dataset (it is only used for the label encoder)
74
+ train_Yelp = dataset_train_Yelp.iloc[:100, :]
75
+
76
+ # Taking only necessary columns
77
+ y_test_Yelp = test_Yelp['class_index'].values
78
+ X_train_Yelp = train_Yelp['review_text'].values
79
+ y_train_Yelp = train_Yelp['class_index'].values
80
+
81
+ # Preprocess corpus function
82
+ def pre_process_corpus(corpus):
83
+ processed_corpus = []
84
+ for doc in tqdm.tqdm(corpus):
85
+ doc = contractions.fix(doc)
86
+ doc = BeautifulSoup(doc, "html.parser").get_text()
87
+ doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')
88
+ doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I|re.A)
89
+ doc = doc.lower()
90
+ doc = doc.strip()
91
+ processed_corpus.append(doc)
92
+ return processed_corpus
93
+
94
+ # Preprocessing the Data
95
+ X_test_Yelp = pre_process_corpus(test_Yelp['review_text'].values)
96
+ X_train_Yelp = pre_process_corpus(X_train_Yelp)
97
+
98
+ # Creating and Fitting the Tokenizer
99
+ etc ...
100
+
101
+ # More info on the model page on Kaggle:
102
+
103
+ https://www.kaggle.com/models/yacharki/pre-trained-model-binary-cnn-nlp-yelpreviews