hamza50 committed on
Commit
07681a0
1 Parent(s): 117882b

Upload 6 files

Browse files
corpus_embeddings_bi_encoder.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1854af45783940daefdea27ee8e42f026faefdc4ff4a41067c6ee4ca6eb74ade
3
+ size 64918
df_combined_paris.csv ADDED
The diff for this file is too large to render. See raw diff
 
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3759225896afa4282dee721d96d1d1a8085cde7ccffe29e975568a5499a36548
3
+ size 64640
paris-newer.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+
6
+ @author: Hamza Farooq
7
+ """
8
+
9
+ import spacy
10
+ from spacy.lang.en.stop_words import STOP_WORDS
11
+ from string import punctuation
12
+ from collections import Counter
13
+ from heapq import nlargest
14
+ import os
15
+ nlp = spacy.load("en_core_web_sm")
16
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
17
+ import datetime
18
+
19
+ from spacy import displacy
20
+ import streamlit as st
21
+ import matplotlib.pyplot as plt
22
+ from wordcloud import WordCloud
23
+ from matplotlib import pyplot as plt
24
+
25
+ import nltk
26
+ from rank_bm25 import BM25Okapi
27
+ from sklearn.feature_extraction import _stop_words
28
+ import string
29
+ from tqdm.autonotebook import tqdm
30
+ import numpy as np
31
+ import pandas as pd
32
+ from sentence_transformers import SentenceTransformer
33
+ import scipy.spatial
34
+ import pickle
35
+ from sentence_transformers import SentenceTransformer, util
36
+ import torch
37
+
38
+
39
+
40
+
41
+
42
+ # import utils as utl
43
+
44
+ import time
45
+ import torch
46
+ import transformers
47
+ from transformers import BartTokenizer, BartForConditionalGeneration
48
+ from string import punctuation
49
+ # tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
50
+
51
+ import numpy as np
52
+ import pandas as pd
53
+ from sentence_transformers import SentenceTransformer
54
+ import scipy.spatial
55
+
56
+
57
+ from sentence_transformers import SentenceTransformer, util
58
+ import torch
59
+
60
+
61
+
62
+ def main():
63
+
64
+
65
+
66
+
67
+ # Settings
68
+ st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
69
+ from string import punctuation
70
+ punctuation=punctuation+ '\n'
71
+
72
+
73
+ from sentence_transformers import SentenceTransformer, util
74
+ import torch
75
+ import numpy as np
76
+ import pandas as pd
77
+ from sentence_transformers import SentenceTransformer
78
+ import scipy.spatial
79
+
80
+ from sentence_transformers import SentenceTransformer, util
81
+ import torch
82
+ #import os
83
@st.cache(allow_output_mutation=True)
def load_model():
    """Instantiate the three models used by the app.

    Decorated with ``st.cache`` so Streamlit can reuse the loaded objects
    instead of constructing them again.

    Returns:
        A 3-tuple, in this order: the ``all-MiniLM-L6-v2`` sentence
        embedder, the ``multi-qa-MiniLM-L6-cos-v1`` bi-encoder, and the
        ``cross-encoder/ms-marco-MiniLM-L-6-v2`` cross-encoder.
    """
    sentence_embedder = SentenceTransformer('all-MiniLM-L6-v2')
    retrieval_bi_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return sentence_embedder, retrieval_bi_encoder, reranker
86
+ embedder,bi_encoder,cross_encoder = load_model()
87
+
88
+
89
+
90
+
91
+ #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
92
+ st.title("travelle - Parisian Hotel Finder")
93
+ with st.expander("ℹ️ - About this app", expanded=True):
94
+
95
+ st.write(
96
+ """
97
+ - travelle is a hotel search engine that allows users to enter free text query to make the search result personalized to user preference as opposed to other travel websites where a user has to spend hours going through hotel list.
98
+ - We use natural language processing and big data to return results customized for your preferences.
99
+ - A user can enter just about anything and we will narrow the results to what closely matches your requirements.
100
+ - For e.g. a user can enter a query like "Hotel near the Eiffel and cheaper than $300 per night with free breakfast" and we will find the closest results
101
+ """
102
+ )
103
+
104
+
105
+ punctuation=punctuation+ '\n'
106
+
107
+
108
+ #import os
109
+
110
+ # embedder = SentenceTransformer('all-MiniLM-L6-v2')
111
+
112
+
113
+
114
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()
117
+
118
import re

df_all = pd.read_csv('paris_clean_newer.csv')

# Concatenate every `text` value of a hotel into a single `all_review`
# string, giving one row per hotel.
df_combined = (
    df_all.sort_values(['Hotel'])
    .groupby('Hotel', sort=False)
    .text.apply(''.join)
    .reset_index(name='all_review')
)
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
df_combined_paris_summary = df_combined_paris_summary[['Hotel', 'summary']]

# df_combined = pd.read_csv('df_combined.csv')

# Keep only ASCII letters, digits and whitespace. The class used to be
# written `A-z`, which spans the ASCII gap between 'Z' and 'a' and therefore
# also kept `[`, `\`, `]`, `^`, `_` and the backtick. A raw string avoids the
# invalid-escape warning for `\s`.
# NOTE(review): the precomputed embeddings / tokenized corpus loaded later were
# presumably built from text cleaned with the old pattern; they are matched to
# the corpus by position, not by content - confirm and regenerate if needed.
non_alphanumeric = re.compile(r'[^a-zA-Z0-9\s]')
df_combined['all_review'] = df_combined['all_review'].apply(
    lambda review: non_alphanumeric.sub('', review)
)
df_combined['all_review'] = df_combined['all_review'].apply(lower_case)

# Hotel-level attributes, de-duplicated, with the summary attached when one
# exists for the hotel (left join).
df_basic = df_all[['Hotel', 'description', 'price_per_night']].drop_duplicates()
df_basic = df_basic.merge(df_combined_paris_summary, how='left')
df_combined_e = df_combined.merge(df_basic)

# The searchable text of a hotel is description + reviews + price.
df_combined_e['all_review'] = (
    df_combined_e['description']
    + df_combined_e['all_review']
    + df_combined_e['price_per_night']
)

df = df_combined_e.copy()
139
+
140
+
141
+ df_sentences = df_combined_e.set_index("all_review")
142
+
143
+ df_sentences = df_sentences["Hotel"].to_dict()
144
+ df_sentences_list = list(df_sentences.keys())
145
+
146
+
147
+
148
+ import pandas as pd
149
+ from tqdm import tqdm
150
+ from sentence_transformers import SentenceTransformer, util
151
+
152
+ df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
153
+ #
154
+ corpus = df_sentences_list
155
+ # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
156
+ corpus_embeddings = np.load('embeddings.npy')
157
+
158
+ bi_encoder.max_seq_length = 512 #Truncate inputs longer than 512 tokens
159
+ top_k = 32 #Number of passages we want to retrieve with the bi-encoder
160
+
161
+ #The bi-encoder will retrieve top_k documents. We use a cross-encoder to re-rank the results list to improve the quality
162
+
163
+ # corpus_embeddings_h = np.load('embeddings_h_r.npy')
164
+
165
+ with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
166
+ doc_embedding = pickle.load(pkl)
167
+
168
+ with open('tokenized_corpus.pickle', 'rb') as pkl:
169
+ tokenized_corpus = pickle.load(pkl)
170
+
171
+ bm25 = BM25Okapi(tokenized_corpus)
172
+ passages = corpus
173
+
174
+
175
+
176
+
177
def bm25_tokenizer(text):
    """Turn ``text`` into the token list used for BM25 scoring.

    The text is lower-cased and split on whitespace; each piece has leading
    and trailing punctuation stripped. Pieces that end up empty, or that are
    English stop-words, are left out.

    Args:
        text: The string to tokenize.

    Returns:
        The list of remaining tokens, in their original order.
    """
    stripped_words = (
        word.strip(string.punctuation) for word in text.lower().split()
    )
    return [
        word
        for word in stripped_words
        if word and word not in _stop_words.ENGLISH_STOP_WORDS
    ]
186
+
187
+
188
def search(query):
    """Search the hotel corpus for ``query`` and render the results.

    Writes to the Streamlit page, in order: the spaCy entity visualisation
    of the query, the top BM25 (lexical) hits, the top bi-encoder hits, and
    the same bi-encoder candidates re-ordered by cross-encoder score.
    Nothing is returned.

    Relies on names from the enclosing scope: ``nlp``, ``bm25``,
    ``bm25_tokenizer``, ``bi_encoder``, ``cross_encoder``, ``corpus``,
    ``passages``, ``corpus_embeddings``, ``top_k``, ``df`` and ``df_basic``.

    Args:
        query: Free-text search request.
    """
    shown_per_section = 5

    def show_hotel(corpus_id):
        """Render name, price and description of the hotel behind one corpus entry."""
        # Map the corpus text back to its hotel row, then to the hotel details.
        matching_rows = df.loc[df['all_review'] == corpus[corpus_id]]
        hotel_name = matching_rows['Hotel'].values[0]
        st.subheader(hotel_name)
        details = df_basic.loc[df_basic.Hotel == hotel_name]
        description = details.description.values[0]
        st.write(f'\tPrice Per night: {details.price_per_night.values[0]}')
        st.write(f'Description: {description}')
        # NOTE(review): this expander is created but never filled, so it shows
        # up as an empty collapsible labelled with the description. Kept to
        # preserve the current page; confirm whether the summary was meant to
        # go inside it.
        st.expander(description, expanded=False)

    # Highlight the entities of the query that was actually passed in (this
    # used to read the outer `userinput` variable instead of the argument).
    doc = nlp(str(query))
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    # Display the entity visualization in the browser:
    st.markdown(ent_html, unsafe_allow_html=True)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # Sorting and slicing also works when the corpus holds fewer than five
    # documents, where np.argpartition(scores, -5) raises.
    best_first = np.argsort(bm25_scores)[::-1][:shown_per_section]

    st.title("Top-5 lexical search (BM25) hits")
    for corpus_id in best_first:
        show_hotel(corpus_id)

    ##### Semantic Search #####
    # Encode the query using the bi-encoder and find potentially relevant passages
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    # NOTE(review): the query is encoded with `bi_encoder` but compared with
    # `corpus_embeddings`; the `doc_embedding` loaded from
    # corpus_embeddings_bi_encoder.pickle is never used. Confirm which corpus
    # matrix belongs to the bi-encoder.
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # Get the hits for the first query

    ##### Re-Ranking #####
    # Score every retrieved passage against the query with the cross-encoder
    # and attach the score to its hit.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Output of top-5 hits from bi-encoder
    st.write("\n-------------------------\n")
    st.title("Top-5 Bi-Encoder Retrieval hits")
    hits = sorted(hits, key=lambda x: x['score'], reverse=True)
    for hit in hits[:shown_per_section]:
        show_hotel(hit['corpus_id'])

    # Output of top-5 hits from re-ranker
    st.write("\n-------------------------\n")
    st.title("Top-5 Cross-Encoder Re-ranker hits")
    hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
    for hit in hits[:shown_per_section]:
        show_hotel(hit['corpus_id'])
272
+
273
+
274
+
275
+
276
# Placeholder shown in the query box; a query equal to it counts as "no query".
sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
userinput = st.text_input(
    'Tell us what are you looking in your hotel?',
    sampletext,
    autocomplete="on",
)
# The two dates are only collected here; nothing in the visible code reads them.
da = st.date_input(
    "Date Check-in",
    datetime.date(2023, 6, 3))

dst = st.date_input(
    "Date Check-out",
    datetime.date(2023, 6, 8))

if not userinput or userinput == sampletext:
    st.write("Please enter a query to get results")
else:
    # search() parses and renders the query itself, so the extra spaCy parse
    # and the unused `query` list that used to be built here are dropped.
    search(str(userinput))
293
+
294
+ # We use cosine-similarity and torch.topk to find the highest 5 scores
295
+
296
+ if __name__ == '__main__':
297
+ main()
paris_clean_newer.csv ADDED
The diff for this file is too large to render. See raw diff
 
tokenized_corpus.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e99b20be01f7889248d5b3f667df8947ae6ca676f3a525717305e5124c8b739e
3
+ size 1261235