File size: 6,153 Bytes
4cd06db
 
 
7211544
4cd06db
 
 
8bd3238
 
 
4cd06db
 
8bd3238
4cd06db
 
ec89c81
 
 
 
 
 
 
 
 
4cd06db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
243f2c2
 
 
 
 
 
 
 
 
 
 
 
 
4cd06db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import re
import emoji
import joblib
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchtext.vocab as vocab


import gradio as gr

import spacy
import en_core_web_sm

# Load the small English spaCy pipeline exactly once. The original code built
# it twice (spacy.load("en_core_web_sm") followed by en_core_web_sm.load())
# and threw the first instance away, doubling start-up time for no benefit.
nlp = en_core_web_sm.load()


# Pretrained 100-dimensional GloVe "6B" vectors. They provide both the
# embedding weights (glove.vectors) and the token -> index lookup (glove.stoi)
# used further down in this file.
glove = vocab.GloVe(name='6B', dim=100)

def remove_html(text):
    """Return ``text`` with every ``<...>`` tag-like span removed.

    Matching is non-greedy and does not cross line breaks, so a ``<`` whose
    closing ``>`` sits on a later line is left in place.
    """
    tag_pattern = re.compile(r"<.*?>")
    return tag_pattern.sub("", text)

def remove_url(text):
    """Return ``text`` with web addresses deleted.

    A web address is anything starting with ``http://``, ``https://`` or
    ``www.`` and running up to the next whitespace character.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

def emoji_to_text(text):
    """Replace each emoji character in ``text`` with its textual name.

    The string is scanned one character at a time. A character that
    ``emoji.is_emoji`` recognises is swapped for its ``emoji.demojize`` form
    padded with one space on each side; every other character is copied
    through unchanged.
    """
    pieces = [
        f" {emoji.demojize(symbol)} " if emoji.is_emoji(symbol) else symbol
        for symbol in text
    ]
    return "".join(pieces)

def clean_review_text(text):
    """Normalise a raw review into a space-joined string of lemmas.

    Steps, in order: strip HTML tags, strip URLs, spell out emoji as text,
    lowercase, then run the spaCy pipeline and keep the lemma of every token
    that is not a stop word, punctuation or whitespace.
    """
    # Markup and links go first so they never reach the emoji scan or spaCy.
    normalized = emoji_to_text(remove_url(remove_html(text))).lower()

    kept_lemmas = []
    for token in nlp(normalized):
        # Drop stop words (is, am, the, ...), punctuation and whitespace tokens.
        if token.is_stop or token.is_punct or token.is_space:
            continue
        # lemma_ is the root form of the word (went / gone / going -> go).
        kept_lemmas.append(token.lemma_)

    return " ".join(kept_lemmas)
    

    


# Main CNN model definition
class CNNHotelReviewsModel(nn.Module):
    """Convolutional text classifier over frozen pretrained word embeddings.

    Each filter size gets its own ``Conv2d`` whose kernel spans the full
    embedding width, so it slides over token positions only. The
    max-over-time pooled feature maps are concatenated, passed through
    dropout and a linear layer, and squashed with a sigmoid.

    Args:
        embedding_dim: Width of one embedding vector; must equal the second
            dimension of the pretrained vectors.
        n_filters: Number of output channels per convolution.
        filter_sizes: Iterable of kernel heights (token-window sizes).
        output_dim: Number of output units of the final linear layer.
        dropout: Dropout probability applied to the pooled features.
        pretrained_vectors: Optional 2-D float tensor of embedding weights.
            When omitted, the module-level ``glove.vectors`` is used, which
            is what the original hard-coded behaviour did.

    Raises:
        ValueError: If the pretrained vectors are not 2-D or their width
            differs from ``embedding_dim``.
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout,
                 pretrained_vectors=None):
        super().__init__()

        # Fall back to the GloVe table loaded at module level so existing
        # five-argument callers keep working unchanged.
        if pretrained_vectors is None:
            pretrained_vectors = glove.vectors
        if pretrained_vectors.dim() != 2 or pretrained_vectors.size(1) != embedding_dim:
            # A mismatch would otherwise only surface as an obscure shape
            # error inside forward().
            raise ValueError(
                f"pretrained vectors of shape {tuple(pretrained_vectors.shape)} "
                f"do not match embedding_dim={embedding_dim}"
            )

        self.embedding = nn.Embedding.from_pretrained(pretrained_vectors, freeze=True)
        self.convs = nn.ModuleList([
            nn.Conv2d(in_channels=1,
                      out_channels=n_filters,
                      kernel_size=(fs, embedding_dim))
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Score a batch of token-index sequences.

        Args:
            text: Integer index tensor; assumed to be (batch, seq_len) with
                seq_len at least as large as the biggest filter size.

        Returns:
            Sigmoid scores. Dimension 1 is squeezed away, so the result is
            (batch,) when ``output_dim`` is 1.
        """
        embedded = self.embedding(text)
        # Add a singleton channel dimension for Conv2d.
        embedded = embedded.unsqueeze(1)
        # The kernel covers the whole embedding width, leaving a trailing
        # dimension of size 1 that is squeezed off.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max over all token positions for every filter.
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.sigmoid(self.fc(cat)).squeeze(1)




# Pick the compute device: CUDA when available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 100  # width of the GloVe vectors loaded above (dim=100)
OUTPUT_DIM = 1  # one sigmoid score: reported as 'positive', with 1 - score as 'negative'

N_FILTERS = 250
FILTER_SIZES = [2, 3, 4]
DROPOUT = 0.1
# Best Hyperparameters: {'n_filters': 250, 'filter_sizes': [2, 3, 4], 'dropout': 0.1}

# Build the network with the same hyperparameters the saved weights expect.
CNN_Model = CNNHotelReviewsModel(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)



# Load the saved state_dict into the model.
# NOTE(review): torch.load unpickles the file, so only load checkpoints from a
# trusted source.
CNN_Model.load_state_dict(torch.load("hotel_review_model.pth", map_location=device))
CNN_Model = CNN_Model.to(device)  # Move the model to the selected device
CNN_Model.eval()  # Set the model to evaluation mode

# LDA model and its dictionary, used for aspect (topic) selection.
# NOTE(review): joblib.load also unpickles; these files must be trusted too.
lda_model = joblib.load('lda_model.pkl')
dictionary = joblib.load('dictionary.pkl')

# CNN prediction model
def predict_review(model, review, max_len=128):
    """Score one cleaned review with ``model``.

    Args:
        model: Callable network that maps a (1, max_len) index tensor to a
            single-element tensor.
        review: Review text; it is split on whitespace into tokens.
        max_len: Exact number of token indices fed to the model.

    Returns:
        ``{'positive': p, 'negative': 1 - p}`` where ``p`` is the model
        output as a Python float.
    """
    # Look up each token in the GloVe vocabulary; unknown tokens map to 0.
    token_ids = [glove.stoi.get(word, 0) for word in review.split()]

    # Right-pad with zeros, then cut, so the sequence is exactly max_len long.
    token_ids = (token_ids + [0] * max_len)[:max_len]

    # Leading batch dimension of size 1, placed on the model's device.
    batch = torch.tensor(token_ids).unsqueeze(0)

    model.eval()  # inference behaviour (e.g. dropout disabled)
    with torch.no_grad():
        prob = model(batch.to(device)).item()

    return {'positive': prob, 'negative': 1-prob}



# Human-readable aspect label for each LDA topic id. The labels are manual
# interpretations of the topics (based on the topic analysis assumptions made
# when the LDA model was built - presumably in the training notebook).
aspect_label = {
0: "Reception & Service Efficiency",
1: "Transportation & Proximity",
2: "Room Comfort & Staff Courtesy",
3: "Location & Staff Quality",
4: "Room Discrepancies",
5: "Hotel Quality vs Price",
6: "Booking & Payment Issues",
7: "Room Ambiance & Noise",
8: "Amenities & Value",
9: "Room Size & Condition",
}

def dominant_topic(text):
    """Return the LDA topic weights of ``text`` keyed by aspect label.

    Despite the name, every topic reported by the LDA model is returned, not
    just the strongest one; the caller decides how many to display.

    Args:
        text: Cleaned review text; it is split on whitespace into tokens.

    Returns:
        Dict mapping ``aspect_label`` names to float topic weights. Empty
        when the model reports no topics for the document.
    """
    bow = dictionary.doc2bow(text.split())
    topics = lda_model.get_document_topics(bow)
    # The previous unused `max(topics, ...)` was dropped: its result was never
    # read and it raised ValueError whenever `topics` was empty.
    return {aspect_label[topic_id]: float(weight) for topic_id, weight in topics}


def gr_fun(Review):
    """Gradio callback: clean the review, then score sentiment and aspects.

    Returns a pair ``(sentiment_scores, aspect_scores)``, both dicts mapping
    a label to a float.
    """
    cleaned = clean_review_text(Review)
    return predict_review(CNN_Model, cleaned), dominant_topic(cleaned)






# Gradio UI: one free-text input wired to gr_fun, with two label outputs.
# The first shows the sentiment scores, the second the aspect scores limited
# to the five highest-scoring classes.
iface = gr.Interface(
    fn=gr_fun,
    inputs="text",
    outputs=[gr.Label(), gr.Label(num_top_classes=5)],
    # Sample reviews offered as clickable examples in the UI.
    examples=[
        "room condition was very bad",
        "Staff where excellent and the room was lovely really great hotel will definitely be back",
        "Couldn t find ice machine The junior suite was excellent with a fantastic bar",
        "Furniture in the room was a bit worn and tired for the money you pay would just expect a bit more  it was ok",
        "Room was West facing and was far too warm particularly as the a c didn t seem to be working to well  The shower room was excellent and large enough for my lady and I to be rude in Loved it"
        ]
    )

# Start the app as soon as this module is executed.
# NOTE(review): inline=False presumably prevents embedding the UI inline in a
# notebook - confirm against the Gradio launch() documentation.
iface.launch(inline = False)