"""Gradio demo: hotel-review sentiment (CNN over GloVe) and aspect topics (LDA).

Importing this module loads the spaCy pipeline, the GloVe vectors, the trained
CNN weights and the LDA artifacts, then launches the Gradio interface.
"""

import re
import emoji
import joblib
# from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, f1_score
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torchtext.vocab as vocab

import gradio as gr

import spacy
import en_core_web_sm

# Load the spaCy English pipeline once, via its installed package module.
nlp = en_core_web_sm.load()

# Pre-trained GloVe vectors (6B tokens corpus, 100 dimensions).
glove = vocab.GloVe(name='6B', dim=100)

# Compiled once at import time; used by remove_html / remove_url.
_HTML_TAG_RE = re.compile(r"<.*?>")
_URL_RE = re.compile(r"https?://\S+|www\.\S+")


def remove_html(text: str) -> str:
    """Return *text* with every ``<...>`` tag-like span removed."""
    return _HTML_TAG_RE.sub("", text)


def remove_url(text: str) -> str:
    """Return *text* with ``http(s)://...`` and ``www....`` URLs removed."""
    return _URL_RE.sub("", text)


def emoji_to_text(text: str) -> str:
    """Replace each emoji character in *text* with its space-padded name.

    The text is processed one character (code point) at a time: a character
    for which ``emoji.is_emoji`` is true becomes ``" <demojized name> "``,
    every other character is copied unchanged.
    """
    # NOTE(review): per-character handling means multi-code-point emoji are
    # presumably translated piecewise; kept as-is so preprocessing matches
    # whatever the model was trained on — confirm before changing.
    return "".join(
        f" {emoji.demojize(ch)} " if emoji.is_emoji(ch) else ch
        for ch in text
    )


def clean_review_text(text: str) -> str:
    """Normalize a raw review into a space-joined string of lemmas.

    Steps, in order: strip HTML tags, strip URLs, convert emoji to text,
    lower-case, then run spaCy and keep the lemma of every token that is not
    a stop word, punctuation or whitespace.
    """
    text = remove_html(text)
    text = remove_url(text)
    text = emoji_to_text(text)
    text = text.lower()

    # Token attributes used for filtering:
    #   token.is_stop  -> stop word (is, am, are, a, an, the, ...)
    #   token.is_punct -> punctuation (. ! , : ; ...)
    #   token.is_space -> whitespace token (tab, space, ...)
    # token.lemma_ is the root form (go | went | gone | going -> go).
    doc = nlp(text)
    lemmas = [
        token.lemma_
        for token in doc
        if not (token.is_stop or token.is_punct or token.is_space)
    ]
    return " ".join(lemmas)


class CNNHotelReviewsModel(nn.Module):
    """Multi-filter-size text CNN over frozen GloVe embeddings.

    Args:
        embedding_dim: Width of each convolution kernel; must equal the
            dimensionality of the GloVe vectors used for the embedding.
        n_filters: Number of output channels per convolution.
        filter_sizes: Kernel heights (token window sizes), one conv each.
        output_dim: Size of the final linear layer's output.
        dropout: Dropout probability applied to the pooled features.
    """

    def __init__(self, embedding_dim, n_filters, filter_sizes, output_dim, dropout):
        super().__init__()
        # Attribute names are part of the saved state_dict keys; keep them.
        self.embedding = nn.Embedding.from_pretrained(glove.vectors, freeze=True)
        self.convs = nn.ModuleList([
            nn.Conv2d(
                in_channels=1,
                out_channels=n_filters,
                kernel_size=(fs, embedding_dim),
            )
            for fs in filter_sizes
        ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        self.sigmoid = nn.Sigmoid()

    def forward(self, text):
        """Return sigmoid scores for a batch of token-index sequences.

        *text* is indexed as (batch, seq_len); dimension 1 of the linear
        output is squeezed, so with ``output_dim == 1`` the result has one
        score per batch element.
        """
        embedded = self.embedding(text)
        # Add a channel dimension so Conv2d sees (batch, 1, seq_len, emb_dim).
        embedded = embedded.unsqueeze(1)
        # Each kernel spans the full embedding width, leaving a size-1 last
        # dimension that is squeezed away.
        conved = [F.relu(conv(embedded)).squeeze(3) for conv in self.convs]
        # Max-over-time pooling: one value per filter.
        pooled = [
            F.max_pool1d(feature_map, feature_map.shape[2]).squeeze(2)
            for feature_map in conved
        ]
        cat = self.dropout(torch.cat(pooled, dim=1))
        return self.sigmoid(self.fc(cat)).squeeze(1)


# Use the GPU when available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

EMBEDDING_DIM = 100
OUTPUT_DIM = 1  # Single sigmoid score, reported as the 'positive' probability
N_FILTERS = 250
FILTER_SIZES = [2, 3, 4]
DROPOUT = 0.1

# Best Hyperparameters: {'n_filters': 250, 'filter_sizes': [2, 3, 4], 'dropout': 0.1}
CNN_Model = CNNHotelReviewsModel(EMBEDDING_DIM, N_FILTERS, FILTER_SIZES, OUTPUT_DIM, DROPOUT)

# Load the saved state_dict into the model.
CNN_Model.load_state_dict(torch.load("hotel_review_model.pth", map_location=device))
CNN_Model = CNN_Model.to(device)  # Move the model to the selected device
CNN_Model.eval()  # Inference only: disables dropout

# LDA model and its dictionary, used for aspect selection.
# NOTE: joblib.load unpickles arbitrary objects; only load trusted artifacts.
lda_model = joblib.load('lda_model.pkl')
dictionary = joblib.load('dictionary.pkl')


def predict_review(model, review, max_len=128):
    """Score a cleaned review with *model* and return class probabilities.

    Args:
        model: Module mapping a (1, max_len) index tensor to a single score.
        review: Whitespace-tokenizable review text.
        max_len: Fixed sequence length; shorter inputs are padded with index
            0, longer ones are truncated.

    Returns:
        ``{'positive': p, 'negative': 1 - p}`` where ``p`` is the model output.
    """
    # NOTE(review): index 0 serves as both padding and the fallback for
    # tokens missing from the GloVe vocabulary — presumably matching how the
    # model was trained; confirm before changing.
    indices = [glove.stoi.get(token, 0) for token in review.split()][:max_len]
    indices += [0] * (max_len - len(indices))

    # Shape (1, max_len): a batch containing this single review.
    batch = torch.tensor(indices).unsqueeze(0)

    model.eval()
    with torch.no_grad():
        output = model(batch.to(device))

    prob = output.item()
    return {'positive': prob, 'negative': 1-prob}


# Human-readable label for each LDA topic id.
aspect_label = {
    0: "Reception & Service Efficiency",
    1: "Transportation & Proximity",
    2: "Room Comfort & Staff Courtesy",
    3: "Location & Staff Quality",
    4: "Room Discrepancies",
    5: "Hotel Quality vs Price",
    6: "Booking & Payment Issues",
    7: "Room Ambiance & Noise",
    8: "Amenities & Value",
    9: "Room Size & Condition",
}


def dominant_topic(text):
    """Return ``{aspect label: probability}`` for the LDA topics of *text*.

    *text* is split on whitespace, converted to a bag of words with the
    loaded dictionary, and scored by the LDA model. Every topic the model
    returns is included; an empty topic list yields an empty dict.
    """
    bow = dictionary.doc2bow(text.split())
    topics = lda_model.get_document_topics(bow)
    return {
        aspect_label[topic_id]: float(probability)
        for topic_id, probability in topics
    }


def gr_fun(Review):
    """Gradio callback: return (sentiment probabilities, aspect probabilities)."""
    review = clean_review_text(Review)
    pred_label = predict_review(CNN_Model, review)
    pred_aspect = dominant_topic(review)
    return pred_label, pred_aspect


iface = gr.Interface(
    fn=gr_fun,
    inputs="text",
    outputs=[gr.Label(), gr.Label(num_top_classes=5)],
    examples=[
        "room condition was very bad",
        "Staff where excellent and the room was lovely really great hotel will definitely be back",
        "Couldn t find ice machine The junior suite was excellent with a fantastic bar",
        "Furniture in the room was a bit worn and tired for the money you pay would just expect a bit more it was ok",
        "Room was West facing and was far too warm particularly as the a c didn t seem to be working to well The shower room was excellent and large enough for my lady and I to be rude in Loved it",
    ],
)

iface.launch(inline=False)