yassiracharki
committed on
Commit
•
d982179
1
Parent(s):
fdede4d
Update README.md
Browse files
README.md
CHANGED
@@ -1,3 +1,103 @@
|
|
1 |
-
---
|
2 |
-
license: apache-2.0
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
---
|
2 |
+
license: apache-2.0
|
3 |
+
metrics:
|
4 |
+
- accuracy
|
5 |
+
pipeline_tag: text-classification
|
6 |
+
tags:
|
7 |
+
- CNN
|
8 |
+
- NLP
|
9 |
+
- Yelp
|
10 |
+
- Reviews
|
11 |
+
- pre_trained
|
12 |
+
---
|
13 |
+
# Model Card: Pre-trained Binary CNN for Yelp Review Text Classification
|
14 |
+
|
15 |
+
# Downloads
|
16 |
+
!pip install contractions
|
17 |
+
!pip install textsearch
|
18 |
+
!pip install tqdm
|
19 |
+
|
20 |
+
import nltk
|
21 |
+
nltk.download('punkt')
|
22 |
+
|
23 |
+
# Fundamental classes
|
24 |
+
import tensorflow as tf
|
25 |
+
from tensorflow import keras
|
26 |
+
import pandas as pd
|
27 |
+
import numpy as np
|
28 |
+
|
29 |
+
# Time
|
30 |
+
import time
|
31 |
+
import datetime
|
32 |
+
|
33 |
+
# Preprocessing
|
34 |
+
from tensorflow.keras.preprocessing.text import Tokenizer
|
35 |
+
from tensorflow.keras.preprocessing import sequence
|
36 |
+
from sklearn.preprocessing import LabelEncoder
|
37 |
+
import contractions
|
38 |
+
from bs4 import BeautifulSoup
|
39 |
+
import re
|
40 |
+
import tqdm
|
41 |
+
import unicodedata
|
42 |
+
|
43 |
+
seed = 3541
|
44 |
+
np.random.seed(seed)
|
45 |
+
|
46 |
+
# Define a dummy loss to bypass the error during model loading
|
47 |
+
def dummy_loss(y_true, y_pred):
    """Return ``tf.reduce_mean`` of ``y_pred - y_true``.

    Placeholder loss intended to work around an error raised while
    loading the saved model; it is not a meaningful training objective.
    """
    signed_error = y_pred - y_true
    return tf.reduce_mean(signed_error)
|
49 |
+
|
50 |
+
# Loading the model trained on Yelp reviews
|
51 |
+
modelYelp = keras.models.load_model(
|
52 |
+
'/kaggle/input/pre-trained-model-binary-cnn-nlp-yelpreviews/tensorflow1/pre-trained-model-binary-cnn-nlp-yelp-reviews/1/Binary_Classification_90_Yelp_Reviews_CNN.h5',
|
53 |
+
compile=False
|
54 |
+
)
|
55 |
+
|
56 |
+
# Compile the model with the correct loss function and reduction
|
57 |
+
modelYelp.compile(
|
58 |
+
optimizer='adam',
|
59 |
+
loss=keras.losses.BinaryCrossentropy(reduction=tf.keras.losses.Reduction.SUM_OVER_BATCH_SIZE),
|
60 |
+
metrics=['accuracy']
|
61 |
+
)
|
62 |
+
|
63 |
+
# Loading Yelp test data
|
64 |
+
dataset_test_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/test.csv')
|
65 |
+
|
66 |
+
# Loading Yelp train data (to be used on the label encoder)
|
67 |
+
dataset_train_Yelp = pd.read_csv('/kaggle/input/yelp-reviews-for-sentianalysis-binary-np-csv/yelp_review_sa_binary_csv/train.csv')
|
68 |
+
|
69 |
+
# Shuffling the test and train data
|
70 |
+
test_Yelp = dataset_test_Yelp.sample(frac=1)
|
71 |
+
train_Yelp = dataset_train_Yelp.sample(frac=1)
|
72 |
+
|
73 |
+
# Taking only the first 100 rows of the training dataset (it is only needed for the label encoder)
|
74 |
+
train_Yelp = dataset_train_Yelp.iloc[:100, :]
|
75 |
+
|
76 |
+
# Taking only necessary columns
|
77 |
+
y_test_Yelp = test_Yelp['class_index'].values
|
78 |
+
X_train_Yelp = train_Yelp['review_text'].values
|
79 |
+
y_train_Yelp = train_Yelp['class_index'].values
|
80 |
+
|
81 |
+
# Preprocess corpus function
|
82 |
+
def pre_process_corpus(corpus):
    """Clean every document in *corpus* and return the results as a list.

    Each document is processed in this order:

    1. passed through ``contractions.fix`` (contraction handling),
    2. stripped of HTML markup via BeautifulSoup's ``get_text``,
    3. NFKD-normalised and round-tripped through ASCII, which drops any
       character that has no ASCII representation,
    4. stripped of every character that is not an ASCII letter or whitespace,
    5. lower-cased,
    6. trimmed of leading and trailing whitespace.

    Args:
        corpus: Iterable of text documents (strings).

    Returns:
        A list with one cleaned string per input document, in input order.
    """
    # Compile once, outside the loop, and pass the flags by keyword.
    # The fourth positional argument of re.sub() is `count`, not `flags`, so
    # passing `re.I | re.A` positionally silently limits the number of
    # removals per document instead of setting the flags.
    non_letters = re.compile(r'[^a-zA-Z\s]', flags=re.I | re.A)

    processed_corpus = []
    for doc in tqdm.tqdm(corpus):
        doc = contractions.fix(doc)
        doc = BeautifulSoup(doc, "html.parser").get_text()
        doc = unicodedata.normalize('NFKD', doc).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        doc = non_letters.sub('', doc)
        doc = doc.lower()
        doc = doc.strip()
        processed_corpus.append(doc)
    return processed_corpus
|
93 |
+
|
94 |
+
# Preprocessing the Data
|
95 |
+
X_test_Yelp = pre_process_corpus(test_Yelp['review_text'].values)
|
96 |
+
X_train_Yelp = pre_process_corpus(X_train_Yelp)
|
97 |
+
|
98 |
+
# Creating and Fitting the Tokenizer
|
99 |
+
... (the remaining steps are omitted here; see the Kaggle model page linked below)
|
100 |
+
|
101 |
+
# More info on the model page on Kaggle:
|
102 |
+
|
103 |
+
https://www.kaggle.com/models/yacharki/pre-trained-model-binary-cnn-nlp-yelpreviews
|