hamza50 committed on
Commit
27cc973
1 Parent(s): dd2e37f

Upload 23 files

Browse files
.gitattributes CHANGED
@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
31
  *.zip filter=lfs diff=lfs merge=lfs -text
32
  *.zst filter=lfs diff=lfs merge=lfs -text
33
  *tfevents* filter=lfs diff=lfs merge=lfs -text
34
+ en_core_web_sm-3.2.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# Base image to use
FROM python:3.7.9

# Every application file lives in /app. Setting the working directory first
# makes all later paths resolve there instead of relying on the base image's
# default working directory.
WORKDIR /app

# Port the Streamlit server listens on
EXPOSE 8080

# Copy only requirements.txt first so the dependency layer is rebuilt only
# when the requirements themselves change
COPY requirements.txt /app/requirements.txt

# Install all requirements; --no-cache-dir keeps pip's download cache out of
# the image
RUN pip3 install --no-cache-dir -r /app/requirements.txt

# Copy the rest of the build context into the app directory
COPY . /app

# Run the application on port 8080, listening on all interfaces
ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]
Hotel New York Combined.csv ADDED
The diff for this file is too large to render. See raw diff
 
README.md CHANGED
@@ -1,13 +1 @@
1
- ---
2
- title: Hotelfinder
3
- emoji: 🚀
4
- colorFrom: purple
5
- colorTo: yellow
6
- sdk: gradio
7
- sdk_version: 3.6
8
- app_file: app.py
9
- pinned: false
10
- license: creativeml-openrail-m
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ assignment3
 
 
 
 
 
 
 
 
 
 
 
 
Untitled.ipynb ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "611a3e0e",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "Init Plugin\n",
14
+ "Init Graph Optimizer\n",
15
+ "Init Kernel\n",
16
+ "Collecting en-core-web-sm==3.2.0\n",
17
+ " Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
18
+ " |████████████████████████████████| 13.9 MB 463 kB/s \n",
19
+ "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from en-core-web-sm==3.2.0) (3.2.1)\n",
20
+ "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.5)\n",
21
+ "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.13)\n",
22
+ "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
23
+ "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.62.3)\n",
24
+ "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
25
+ "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
26
+ "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
27
+ "Requirement already satisfied: packaging>=20.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.0)\n",
28
+ "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
29
+ "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.8)\n",
30
+ "Requirement already satisfied: jinja2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.1)\n",
31
+ "Requirement already satisfied: numpy>=1.15.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.21.4)\n",
32
+ "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
33
+ "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
34
+ "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
35
+ "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.8.2)\n",
36
+ "Requirement already satisfied: pathy>=0.3.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
37
+ "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
38
+ "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.26.0)\n",
39
+ "Requirement already satisfied: setuptools in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (59.0.1)\n",
40
+ "Requirement already satisfied: pyparsing>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.7)\n",
41
+ "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
42
+ "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.1.1)\n",
43
+ "Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.2)\n",
44
+ "Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2021.5.30)\n",
45
+ "Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.4)\n",
46
+ "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.6)\n",
47
+ "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.4)\n",
48
+ "Requirement already satisfied: MarkupSafe>=2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.1)\n",
49
+ "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
50
+ "You should consider upgrading via the '/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/bin/python -m pip install --upgrade pip' command.\u001b[0m\n",
51
+ "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
52
+ "You can now load the package via spacy.load('en_core_web_sm')\n"
53
+ ]
54
+ }
55
+ ],
56
+ "source": [
57
+ "!python -m spacy download en_core_web_sm"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": null,
63
+ "id": "51a414e5",
64
+ "metadata": {},
65
+ "outputs": [],
66
+ "source": []
67
+ }
68
+ ],
69
+ "metadata": {
70
+ "kernelspec": {
71
+ "display_name": "Python 3.9.5 64-bit ('tensorflow': conda)",
72
+ "language": "python",
73
+ "name": "python395jvsc74a57bd04bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
74
+ },
75
+ "language_info": {
76
+ "codemirror_mode": {
77
+ "name": "ipython",
78
+ "version": 3
79
+ },
80
+ "file_extension": ".py",
81
+ "mimetype": "text/x-python",
82
+ "name": "python",
83
+ "nbconvert_exporter": "python",
84
+ "pygments_lexer": "ipython3",
85
+ "version": "3.9.5"
86
+ }
87
+ },
88
+ "nbformat": 4,
89
+ "nbformat_minor": 5
90
+ }
app.py ADDED
@@ -0,0 +1,297 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+
6
+ @author: Hamza Farooq
7
+ """
8
+
9
+ import spacy
10
+ from spacy.lang.en.stop_words import STOP_WORDS
11
+ from string import punctuation
12
+ from collections import Counter
13
+ from heapq import nlargest
14
+ import os
15
+ nlp = spacy.load("en_core_web_sm")
16
+ from spacy import displacy
17
+ import streamlit as st
18
+ import matplotlib.pyplot as plt
19
+ from wordcloud import WordCloud
20
+ from matplotlib import pyplot as plt
21
+ import nltk
22
+ nltk.download('stopwords')
23
+ import geonamescache
24
+
25
+ import os
26
+ import streamlit as st
27
+ import utils as utl
28
+ from PIL import Image
29
+ import time
30
+ import torch
31
+ import transformers
32
+ from transformers import BartTokenizer, BartForConditionalGeneration
33
# Tokenizer and model used for BART summarization; loaded once at import time.
tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
# 'gpu' is not a device string torch accepts; use CUDA when it is available
# and fall back to the CPU otherwise.
torch_device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Keep the model on the same device the encoded inputs are moved to.
mdl = mdl.to(torch_device)
36
+
37
+
38
+ def main():
39
+ # Settings
40
+ st.set_page_config(layout="wide", page_title='New York Hotels')
41
def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
    """Summarize ``text`` with the module-level BART tokenizer (``tr``) and model (``mdl``).

    Args:
        text: Input string to summarize.
        num_beams: Beam count forwarded to ``mdl.generate`` (cast to int).
        length_penalty: Length penalty forwarded to ``mdl.generate`` (cast to float).
        max_length: Maximum generated length forwarded to ``mdl.generate`` (cast to int).
        min_length: Minimum generated length forwarded to ``mdl.generate`` (cast to int).
        no_repeat_ngram_size: N-gram repetition limit forwarded to ``mdl.generate`` (cast to int).

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Replace newlines with a space: deleting them outright fuses the last
    # word of one line with the first word of the next.
    text = text.replace('\n', ' ')
    # Ask for truncation explicitly so the max_length=1024 cap is enforced
    # regardless of the tokenizer's default truncation behaviour.
    encoded = tr.batch_encode_plus(
        [text],
        return_tensors='pt',
        max_length=1024,
        truncation=True,
    )
    # Send the ids to whichever device the model currently lives on, so the
    # call does not depend on a separately maintained device string.
    text_input_ids = encoded['input_ids'].to(mdl.device)
    summary_ids = mdl.generate(
        text_input_ids,
        num_beams=int(num_beams),
        length_penalty=float(length_penalty),
        max_length=int(max_length),
        min_length=int(min_length),
        no_repeat_ngram_size=int(no_repeat_ngram_size),
    )
    summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
    return summary_txt
48
+
49
+
50
+ gc = geonamescache.GeonamesCache()
51
+
52
+ # gets nested dictionary for countries
53
+ countries = gc.get_countries()
54
+
55
+ # gets nested dictionary for cities
56
+ cities = gc.get_cities()
57
+ # def gen_dict_extract(var, key):
58
+ # if isinstance(var, dict):
59
+ # for k, v in var.items():
60
+ # if k == key:
61
+ # yield v
62
+ # if isinstance(v, (dict, list)):
63
+ # yield from gen_dict_extract(v, key)
64
+ # elif isinstance(var, list):
65
+ # for d in var:
66
+ # yield from gen_dict_extract(d, key)
67
+ #
68
+ # cities = [*gen_dict_extract(cities, 'name')]
69
+ # countries = [*gen_dict_extract(countries, 'name')]
70
+ #
71
+ # cities.append('New York')
72
+
73
+ from nltk.corpus import stopwords
74
+
75
+ stopwords = set(stopwords.words('english'))
76
+ #mask = np.array(Image.open('upvote.png'))
77
+
78
+ from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
79
+ import matplotlib.pyplot as plt
80
+ #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
81
+ st.title("New York Hotel Finder")
82
+
83
+
84
+ stopwords=list(STOP_WORDS)
85
+ stopwords.extend(['hotel','room','rooms'])
86
+ from string import punctuation
87
+ punctuation=punctuation+ '\n'
88
+
89
+ import pandas as pd
90
+ from sentence_transformers import SentenceTransformer
91
+ import scipy.spatial
92
+ import pickle as pkl
93
+ from sentence_transformers import SentenceTransformer, util
94
+ import torch
95
+ #import os
96
+
97
+ embedder = SentenceTransformer('all-MiniLM-L6-v2')
98
+
99
+ df_all = pd.read_csv('Hotel New York Combined.csv')
100
+
101
+ df_all = df_all[['hotel_name','review_body']]
102
+ #
103
+ # df['hotel_name'].drop_duplicates()
104
+
105
+ # df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).review_body.apply(''.join).reset_index(name='all_review')
106
+
107
+ import re
108
+
109
+ df_combined = pd.read_csv('df_combined.csv')
110
+
111
+ # df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
112
+ #
113
+ # def lower_case(input_str):
114
+ # input_str = input_str.lower()
115
+ # return input_str
116
+ #
117
+ # df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
118
+
119
+ df = df_combined
120
+
121
+ df_sentences = df_combined.set_index("all_review")
122
+
123
+ df_sentences = df_sentences["hotel_name"].to_dict()
124
+ df_sentences_list = list(df_sentences.keys())
125
+
126
+ import pandas as pd
127
+ from tqdm import tqdm
128
+ from sentence_transformers import SentenceTransformer, util
129
+
130
+ df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
131
+ #
132
+ corpus = df_sentences_list
133
+ corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
134
+ #
135
+ # model = SentenceTransformer('all-MiniLM-L6-v2')
136
+ # paraphrases = util.paraphrase_mining(model, corpus)
137
+
138
+ #queries = ['Hotel close to Central Park',
139
+ # 'Hotel with breakfast'
140
+ # ]
141
+
142
+
143
+ # from transformers import AutoTokenizer, AutoModel
144
+ # import torch
145
+ # import torch.nn.functional as F
146
+ #
147
+ # #Mean Pooling - Take attention mask into account for correct averaging
148
+ # def mean_pooling(model_output, attention_mask):
149
+ # token_embeddings = model_output[0] #First element of model_output contains all token embeddings
150
+ # input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
151
+ # return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
152
+ #
153
+ #
154
+ # # Sentences we want sentence embeddings for
155
+ # sentences = corpus
156
+ #
157
+ # # Load model from HuggingFace Hub
158
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
159
+ # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
160
+ #
161
+ # # Tokenize sentences
162
+ # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
163
+ #
164
+ # # Compute token embeddings
165
+ # with torch.no_grad():
166
+ # model_output = model(**encoded_input)
167
+ #
168
+ # # Perform pooling
169
+ # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
170
+ #
171
+ # # Normalize embeddings
172
+ # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
173
+ #
174
+ # st.text("Sentence embeddings:")
175
+ # st.text(sentence_embeddings)
176
+ #
177
+ #
178
+
179
+ #corpus_embeddings = sentence_embeddings
180
+ # Query sentences
181
+
182
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` on a Matplotlib figure and show it in the Streamlit page.

    Args:
        wordcloud: Image-like object accepted by ``Axes.imshow``
            (presumably a ``WordCloud`` instance - confirm with callers).
    """
    # st.pyplot is a function that takes a Matplotlib figure; it has no
    # ``figure`` attribute, so the figure is created through Matplotlib.
    fig, ax = plt.subplots(figsize=(40, 30))
    # Display image
    ax.imshow(wordcloud, interpolation='bilinear')
    # No axis details
    ax.axis("off")
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
189
# Read the free-text query, show the places it mentions, then list the most
# similar hotels and, for each hotel, its most similar individual reviews.
userinput = st.text_input('Tell us what are you looking in your hotel?')
if not userinput:
    st.write("Please enter a query to get results")
else:
    query = [str(userinput)]
    doc = nlp(str(userinput))

    # `countries` / `cities` are nested dictionaries; testing `ent.text in
    # countries` only checks their keys. NOTE(review): geonamescache keys
    # these by code / id and stores the readable name under 'name' - confirm.
    # The name sets let an entity such as "France" match as well.
    country_names = {info.get('name') for info in countries.values()}
    city_names = {info.get('name') for info in cities.values()}

    for ent in doc.ents:
        if ent.label_ != 'GPE':
            continue
        if ent.text in countries or ent.text in country_names:
            st.write(f"Country : {ent.text}")
        elif ent.text in cities or ent.text in city_names:
            st.write("city")
            st.write(ent.text)
            st.write(f"City : {ent.text}")
        else:
            print(f"Other GPE : {ent.text}")

    top_k = min(5, len(corpus))

    # The corpus embeddings are not requested as tensors, so (presumably) they
    # end up on the CPU; keep the query embedding there too so the similarity
    # computation never mixes devices.
    query_embedding = embedder.encode(query, convert_to_tensor=True).cpu()

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    ent_html = displacy.render(doc, style="ent", jupyter=False)
    # Display the entity visualization in the browser:
    st.markdown(ent_html, unsafe_allow_html=True)

    st.write("##")
    st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
    st.write("\n\n======================\n\n")

    for score, idx in zip(top_results[0], top_results[1]):
        row_dict = df.loc[df['all_review'] == corpus[int(idx)]]
        if row_dict.empty:
            # The corpus text was stringified, so it may not map back to a
            # row; skip instead of failing on .values[0].
            continue

        hotel_name = row_dict['hotel_name'].values[0]
        st.subheader(hotel_name)
        hotel_subset = df_all.loc[df_all['hotel_name'] == hotel_name]
        st.caption("Review Summary:")
        st.write(row_dict['summary'].values[0])
        st.caption("Relevancy: {:.4f}".format(float(score)))
        st.caption("Relevant reviews:")

        # Unique reviews of this hotel, first-seen order preserved.
        corpus_h = list(dict.fromkeys(str(review) for review in hotel_subset["review_body"]))
        if not corpus_h:
            continue

        corpus_embeddings_h = embedder.encode(corpus_h, show_progress_bar=True)
        cos_scores_h = util.pytorch_cos_sim(query_embedding, corpus_embeddings_h)[0]
        # A hotel can have fewer reviews than top_k; torch.topk raises when k
        # exceeds the number of scores.
        top_results_h = torch.topk(cos_scores_h, k=min(top_k, len(corpus_h)))

        for review_idx in top_results_h[1]:
            st.write(corpus_h[int(review_idx)])
253
+
254
+ # st.table(hotel_subset.head())
255
+
256
+ # st.write("#")
257
+ #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
258
+ # wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
259
+ # fig, ax = plt.subplots()
260
+ # plt.imshow(wordcloud, interpolation='bilinear')
261
+ # plt.axis("off")
262
+ # plt.show()
263
+ # st.pyplot(fig)
264
+ # st.set_option('deprecation.showPyplotGlobalUse', False)
265
+
266
+
267
+ if __name__ == '__main__':
268
+ main()
269
+
270
+
271
+ # cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
272
+ # top_results = torch.topk(cos_scores, k=top_k)
273
+
274
+ # st.write("\n\n======================\n\n")
275
+ # st.write("Query:", query)
276
+ # st.write("\nTop 5 most similar sentences in corpus using sentence embedding:")
277
+ #
278
+ # for score, idx in zip(top_results[0], top_results[1]):
279
+ # st.write("(Score: {:.4f})".format(score))
280
+ # row_dict = df.loc[df['all_review']== corpus[idx]]
281
+ # st.write("paper_id: " , row_dict['hotel_name'] , "\n")
282
+ # #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
283
+ # wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
284
+ # fig, ax = plt.subplots()
285
+ # plt.imshow(wordcloud, interpolation='bilinear')
286
+ # plt.axis("off")
287
+ # plt.show()
288
+ # st.pyplot(fig)
289
+ # st.set_option('deprecation.showPyplotGlobalUse', False)
290
+
291
+
292
+ # embedder = SentenceTransformer('all-MiniLM-L6-v2')
293
+ #
294
+ # corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
295
+
296
+
297
+ # Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
app.yaml ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ service: default
2
+ runtime: custom
3
+ env: flex
basic.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ from datetime import datetime
3
+ from time import time
4
+ from lxml import html,etree
5
+ from reviews_final import scrape, write_in_csv
6
+ import pandas as pd
7
+ import requests,re
8
+ import os,sys
9
+ import unicodecsv as csv
10
+ import argparse
11
+ import numpy as np
12
+ import json
13
def clean(text):
    """Join ``text`` and collapse every run of whitespace to one space.

    ``text`` may be a string or an iterable of strings; the pieces are
    concatenated without a separator before whitespace is normalised.
    Returns ``None`` when ``text`` is falsy (``None``, ``''``, ``[]``).
    """
    if not text:
        return None
    merged = ''.join(text)
    # split() with no argument drops \n, \r and \t along with spaces
    words = merged.split()
    return ' '.join(words).strip()
18
+
19
+
20
+
21
+
22
def parse(locality, checkin_date, checkout_date, sort):
    """Scrape TripAdvisor hotel listings for ``locality`` and dump them to JSON.

    The locality is resolved through TripAdvisor's type-ahead endpoint. The
    resulting listing page and the pages at offsets 30, 60, 90 and 120 are then
    requested, one record is extracted per hotel, and the accumulated records
    are written to ``data_file.json``.

    Args:
        locality: Free-text place name sent to the type-ahead endpoint.
        checkin_date: Object providing ``strftime`` for the check-in date.
        checkout_date: Object providing ``strftime`` for the check-out date.
        sort: Value sent as the ``sortOrder`` form field.

    Returns:
        The list of the five listing-page URLs that were requested.

    Raises:
        IndexError: If the type-ahead response contains no results.
    """
    checkIn = checkin_date.strftime("%Y/%m/%d")
    checkOut = checkout_date.strftime("%Y/%m/%d")
    print("Scraper Inititated for Locality:%s" % locality)
    header = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
    }

    # TA renders the autocomplete list using this API
    print("Finding search result page URL")
    # Escape the locality so characters such as '&' or '#' cannot alter the
    # query string.
    geo_url = (
        'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime=' + str(int(time()))
        + '&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park'
        + '&neighborhood_geos=true&link_type=hotel&details=true&max=12'
        + '&injectNeighborhoods=true&query=' + requests.utils.quote(locality)
    )
    api_response = requests.get(geo_url, headers=header, timeout=120).json()

    # Getting the TA URL for the query from the autocomplete response
    results = api_response['results']
    if not results:
        raise IndexError("No TripAdvisor autocomplete result for locality %r" % locality)
    url_from_autocomplete = "http://www.tripadvisor.com" + results[0]['url']
    print('URL found %s' % url_from_autocomplete)

    # Build the paged URLs by inserting the offset token after the second
    # dash-separated segment. Every remaining segment is kept, so a URL with
    # more than four segments is no longer cut short.
    parts = url_from_autocomplete.split("-")
    urllist = [url_from_autocomplete]
    for offset in ("oa30", "oa60", "oa90", "oa120"):
        urllist.append("-".join(parts[:2] + [offset] + parts[2:]))

    # Formatting the stay dates for the form payload
    date = checkin_date.strftime("%Y_%m_%d") + "_" + checkout_date.strftime("%Y_%m_%d")
    # Form data to get the hotels list from TA for the selected dates
    form_data = {
        'changeSet': 'TRAVEL_INFO',
        'showSnippets': 'false',
        'staydates': date,
        'uguests': '2',
        'sortOrder': sort,
    }

    # Headers shared by every page request; only the Referer differs per page.
    base_headers = {
        'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
        'Accept-Encoding': 'gzip,deflate',
        'Accept-Language': 'en-US,en;q=0.5',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
        'Host': 'www.tripadvisor.com',
        'Pragma': 'no-cache',
        'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
        'X-Requested-With': 'XMLHttpRequest',
    }
    cookies = {"SetCurrency": "USD"}

    # XPath expressions, defined once instead of once per hotel.
    XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
    XPATH_REVIEWS = './/a[@class="review_count"]//text()'
    XPATH_RANK = './/div[@class="popindex"]//text()'
    # The original selectors matched only the "bubble_45" class, so any hotel
    # whose rating element carried a different bubble class got no rating.
    # NOTE(review): matching on "ui_bubble_rating" alone assumes the other
    # ratings use the same base class - confirm against the live markup.
    XPATH_RATING = './/span[contains(@class,"ui_bubble_rating")]/@alt'
    XPATH_RATING_2 = './/a[contains(@class,"ui_bubble_rating")]/@alt'
    XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
    # "@casls" was a typo for "@class"; with it the feature list was always empty.
    XPATH_HOTEL_FEATURES = './/div[contains(@class,"common_hotel_icons_list")]//li//text()'
    XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
    XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
    XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'
    XPATH_RATING_ORDER = './/span[contains(@class,"dekGp Ci _R S4 H3 MD")]//text()'
    XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'

    json_arr = []
    for page_url in urllist:
        print(page_url)

        # Referer is necessary to get the correct response from TA; without it
        # they redirect to the home page.
        headers = dict(base_headers, Referer=page_url)
        print("Downloading search results page")
        # Certificate verification is left at the library default (the
        # original disabled it with verify=False), and a timeout matching the
        # autocomplete request keeps a stalled connection from hanging forever.
        page_response = requests.post(
            url=page_url,
            data=form_data,
            headers=headers,
            cookies=cookies,
            timeout=120,
        )
        print("Parsing results ")
        parser = html.fromstring(page_response.text)
        hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
        if not hotel_lists:
            hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')

        # This expression is evaluated against the whole page, so its result
        # is identical for every hotel on the page.
        official_description = clean(parser.xpath(XPATH_OFFICIAL_DESCRIPTION))

        for hotel in hotel_lists:
            raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
            raw_no_of_deals = hotel.xpath(XPATH_VIEW_DEALS)
            raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
            raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
            raw_rank = hotel.xpath(XPATH_RANK)
            # Prefer the <a> variant (the one the original used) and fall back
            # to the <span> variant that was defined but never queried.
            raw_rating = hotel.xpath(XPATH_RATING_2) or hotel.xpath(XPATH_RATING)
            raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
            raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
            raw_hotel_price_per_night = hotel.xpath(XPATH_HOTEL_PRICE)
            raw_rank_order = hotel.xpath(XPATH_RATING_ORDER)

            url = 'http://www.tripadvisor.com' + raw_hotel_link[0] if raw_hotel_link else None
            reviews = ''.join(raw_no_of_reviews).replace("reviews", "").replace(",", "") if raw_no_of_reviews else 0
            rank = ''.join(raw_rank) if raw_rank else None
            rating = ''.join(raw_rating).replace('of 5 bubbles', '').strip() if raw_rating else None
            name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
            hotel_features = ','.join(raw_hotel_features)
            price_per_night = ''.join(raw_hotel_price_per_night).replace('\n', '') if raw_hotel_price_per_night else None
            rank_order = ''.join(raw_rank_order) if raw_rank_order else None
            deals_found = re.findall(r"all\s+?(\d+)\s+?", ''.join(raw_no_of_deals))
            no_of_deals = deals_found[0] if deals_found else 0
            booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None

            json_arr.append({
                'hotel_name': name,
                'url': url,
                'locality': locality,
                'reviews': reviews,
                'rank': rank,
                'tripadvisor_rating': rating,
                'checkOut': checkOut,
                'checkIn': checkIn,
                'hotel_features': hotel_features,
                'price_per_night': price_per_night,
                'no_of_deals': no_of_deals,
                'booking_provider': booking_provider,
                'raw_rank': rank_order,
                'desc': official_description,
            })

        if hotel_lists:
            # Rewrite the file once per page rather than once per hotel. The
            # file still holds everything scraped so far if a later page fails.
            print("Writing scraped data")
            with open('data_file.json', 'w') as outfile:
                json.dump(json_arr, outfile)

    return urllist
combined_paris.csv ADDED
The diff for this file is too large to render. See raw diff
 
corpus_embeddings_bi_encoder.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1854af45783940daefdea27ee8e42f026faefdc4ff4a41067c6ee4ca6eb74ade
3
+ size 64918
corpus_embeddings_bi_encoder.pickle 2 ADDED
Binary file (64.9 kB). View file
 
df_combined.csv ADDED
The diff for this file is too large to render. See raw diff
 
df_combined_paris.csv ADDED
The diff for this file is too large to render. See raw diff
 
embeddings.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3759225896afa4282dee721d96d1d1a8085cde7ccffe29e975568a5499a36548
3
+ size 64640
embeddings_h_r.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:76ae4840488129cd4c6917830018219292cca514e62c69ea9e507b185d219aa7
3
+ size 4391552
embeddings_review.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:96cee6d14a75d19eccbe9decb501dd3c5de6c1fe401d3803a82611f075a8a6a8
3
+ size 144512
en_core_web_sm-3.2.0-py3-none-any.whl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e02939fb7fbae6dbcc9c5a1355f5e4e02939b649a1f0846ee844ac1d479bbeb
3
+ size 13900196
paris-newer.py ADDED
@@ -0,0 +1,295 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+
6
+ @author: Hamza Farooq
7
+ """
8
+
9
+ import spacy
10
+ from spacy.lang.en.stop_words import STOP_WORDS
11
+ from string import punctuation
12
+ from collections import Counter
13
+ from heapq import nlargest
14
+ import os
15
+ nlp = spacy.load("en_core_web_sm")
16
+ from sentence_transformers import SentenceTransformer, CrossEncoder, util
17
+ import datetime
18
+
19
+ from spacy import displacy
20
+ import streamlit as st
21
+ import matplotlib.pyplot as plt
22
+ from wordcloud import WordCloud
23
+ from matplotlib import pyplot as plt
24
+
25
+ import nltk
26
+ from rank_bm25 import BM25Okapi
27
+ from sklearn.feature_extraction import _stop_words
28
+ import string
29
+ from tqdm.autonotebook import tqdm
30
+ import numpy as np
31
+ import pandas as pd
32
+ from sentence_transformers import SentenceTransformer
33
+ import scipy.spatial
34
+ import pickle
35
+ from sentence_transformers import SentenceTransformer, util
36
+ import torch
37
+
38
+
39
+
40
+
41
+
42
+ # import utils as utl
43
+
44
+ import time
45
+ import torch
46
+ import transformers
47
+ from transformers import BartTokenizer, BartForConditionalGeneration
48
+ from string import punctuation
49
+ # tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
50
+
51
+ import numpy as np
52
+ import pandas as pd
53
+ from sentence_transformers import SentenceTransformer
54
+ import scipy.spatial
55
+
56
+
57
+ from sentence_transformers import SentenceTransformer, util
58
+ import torch
59
+
60
+
61
+
62
+ def main():
63
+
64
+
65
+
66
+
67
+ # Settings
68
+ st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
69
+ from string import punctuation
70
+ punctuation=punctuation+ '\n'
71
+
72
+
73
+ from sentence_transformers import SentenceTransformer, util
74
+ import torch
75
+ import numpy as np
76
+ import pandas as pd
77
+ from sentence_transformers import SentenceTransformer
78
+ import scipy.spatial
79
+
80
+ from sentence_transformers import SentenceTransformer, util
81
+ import torch
82
+ #import os
83
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the three sentence-transformers models used by the app.

    Returns:
        A tuple ``(embedder, bi_encoder, cross_encoder)``:
        the 'all-MiniLM-L6-v2' sentence encoder, the
        'multi-qa-MiniLM-L6-cos-v1' retrieval encoder and the
        'cross-encoder/ms-marco-MiniLM-L-6-v2' re-ranker.

    The ``st.cache`` decorator lets Streamlit reuse the returned objects
    instead of rebuilding them each time the script runs.
    """
    general_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    retrieval_encoder = SentenceTransformer('multi-qa-MiniLM-L6-cos-v1')
    reranker = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
    return general_encoder, retrieval_encoder, reranker
86
+ embedder,bi_encoder,cross_encoder = load_model()
87
+
88
+
89
+
90
+
91
+ #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
92
# Page title plus a collapsible "about" panel rendered as Markdown.
st.title("Parisian Hotel Finder")
with st.expander("ℹ️ - About this app", expanded=True):
    # Markdown only renders a link when "](" is written without a space
    # between the label and the URL, so the two parts are joined here.
    # Explicit string concatenation keeps the bullets at column 0 regardless
    # of how deeply this block is indented in the file.
    st.write(
        "- This app allows you to search for hotels based on what you're looking for, "
        "rather than just cities - it helps with reducing time to go through exhaustive "
        "reviews for each hotel!\n"
        "- It uses an innovative semantic search approach that leverages multiple NLP "
        "embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗.\n"
    )
101
+
102
+
103
+ punctuation=punctuation+ '\n'
104
+
105
+
106
+ #import os
107
+
108
+ # embedder = SentenceTransformer('all-MiniLM-L6-v2')
109
+
110
+
111
+
112
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()
115
+
116
import re

# Per-review rows; the columns used below are Hotel, text, description and
# price_per_night.
df_all = pd.read_csv('paris_clean_newer.csv')

# One row per hotel with all of its review texts concatenated.
df_combined = (
    df_all.sort_values(['Hotel'])
    .groupby('Hotel', sort=False)
    .text.apply(''.join)
    .reset_index(name='all_review')
)

# Pre-computed per-hotel summaries; only the join key and the summary are kept.
df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
df_combined_paris_summary = df_combined_paris_summary[['Hotel', 'summary']]

# Keep only ASCII letters, digits and whitespace. The range must be ``A-Z``:
# ``A-z`` also spans the characters between 'Z' and 'a' in ASCII
# ([ \ ] ^ _ `), which would then survive the cleanup. A raw string is used
# so that ``\s`` is not treated as a string escape.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
df_combined['all_review'] = df_combined['all_review'].apply(lambda x: non_alnum.sub('', x))
df_combined['all_review'] = df_combined['all_review'].apply(lower_case)

# Hotel-level attributes, enriched with the summary where one exists.
df_basic = df_all[['Hotel', 'description', 'price_per_night']].drop_duplicates()
df_basic = df_basic.merge(df_combined_paris_summary, how='left')
df_combined_e = df_combined.merge(df_basic)

# The searchable passage for a hotel is description + reviews + price.
# NOTE(review): the parts are joined without separators and this assumes
# price_per_night is a string column -- confirm against the CSV.
df_combined_e['all_review'] = (
    df_combined_e['description'] + df_combined_e['all_review'] + df_combined_e['price_per_night']
)

df = df_combined_e.copy()

# Map passage text -> hotel name; the keys (in insertion order) are the corpus.
df_sentences = df_combined_e.set_index("all_review")
df_sentences = df_sentences["Hotel"].to_dict()
df_sentences_list = list(df_sentences.keys())
143
+
144
+
145
+
146
+ import pandas as pd
147
+ from tqdm import tqdm
148
+ from sentence_transformers import SentenceTransformer, util
149
+
150
+ df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
151
+ #
152
+ corpus = df_sentences_list
153
+ # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
154
+ corpus_embeddings = np.load('embeddings.npy')
155
+
156
+ bi_encoder.max_seq_length = 512 #Truncate long passages to 512 tokens
157
+ top_k = 32 #Number of passages we want to retrieve with the bi-encoder
158
+
159
+ #The bi-encoder will retrieve top_k (32) documents. We use a cross-encoder to re-rank the results list and improve its quality
160
+
161
+ # corpus_embeddings_h = np.load('embeddings_h_r.npy')
162
+
163
+ with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
164
+ doc_embedding = pickle.load(pkl)
165
+
166
+ with open('tokenized_corpus.pickle', 'rb') as pkl:
167
+ tokenized_corpus = pickle.load(pkl)
168
+
169
+ bm25 = BM25Okapi(tokenized_corpus)
170
+ passages = corpus
171
+
172
+
173
+
174
+
175
# BM25 tokenizer: lower-cases the text and leaves stop-words out of the result.
def bm25_tokenizer(text):
    """Split ``text`` into the tokens used for BM25 scoring.

    The text is lower-cased and split on whitespace; leading and trailing
    punctuation is stripped from every piece, and pieces that end up empty
    or that appear in ``_stop_words.ENGLISH_STOP_WORDS`` are dropped.

    Returns:
        list[str]: the remaining tokens, in their original order.
    """
    stripped_words = (word.strip(string.punctuation) for word in text.lower().split())
    return [
        word
        for word in stripped_words
        if len(word) > 0 and word not in _stop_words.ENGLISH_STOP_WORDS
    ]
184
+
185
+
186
def search(query):
    """Run the three retrieval stages for ``query`` and render each top-5 list.

    Stages, in display order: BM25 lexical search, bi-encoder semantic
    retrieval, and cross-encoder re-ranking of the bi-encoder hits. All
    output is written to the Streamlit page; nothing is returned.

    Relies on objects from the enclosing scope: ``nlp``, ``bm25``,
    ``bm25_tokenizer``, ``bi_encoder``, ``cross_encoder``, ``corpus`` /
    ``passages``, ``corpus_embeddings``, ``top_k``, ``df`` and ``df_basic``.

    Args:
        query (str): free-text description of the hotel the user wants.
    """

    def _render_hit(corpus_id):
        # Map the corpus passage back to its hotel and show name, price and
        # description. The description goes *inside* the expander: calling
        # st.expander(text) on its own only creates an empty expander that
        # uses the text as its label.
        matched = df.loc[df['all_review'] == corpus[corpus_id]]
        hotel_name = matched['Hotel'].values[0]
        details = df_basic.loc[df_basic.Hotel == hotel_name]
        st.subheader(hotel_name)
        st.write(f'\tPrice Per night: {details.price_per_night.values[0]}')
        with st.expander('Description', expanded=False):
            st.write(details.description.values[0])

    # Highlight the named entities spaCy finds in the query (uses the
    # ``query`` argument rather than reaching for the outer ``userinput``).
    doc = nlp(str(query))
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    st.markdown(ent_html, unsafe_allow_html=True)

    ##### BM25 search (lexical search) #####
    bm25_scores = bm25.get_scores(bm25_tokenizer(query))
    # argsort + slice works for any corpus size, whereas
    # argpartition(scores, -5) raises when there are fewer than 5 entries.
    best_lexical = np.argsort(bm25_scores)[::-1][:5]

    st.title("Top-5 lexical search (BM25) hits")
    for corpus_id in best_lexical:
        _render_hit(corpus_id)

    ##### Semantic search #####
    # Encode the query with the bi-encoder and fetch candidate passages.
    # NOTE(review): the commented-out code beside the 'embeddings.npy' load
    # suggests ``corpus_embeddings`` was produced by ``embedder``, while the
    # query here is encoded with ``bi_encoder``; ``doc_embedding`` is loaded
    # from 'corpus_embeddings_bi_encoder.pickle' but never used. Confirm
    # which matrix is intended before changing it.
    question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
    hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
    hits = hits[0]  # hits for the first (and only) query

    ##### Re-ranking #####
    # Score every retrieved passage against the query with the cross-encoder.
    cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
    cross_scores = cross_encoder.predict(cross_inp)
    for hit, cross_score in zip(hits, cross_scores):
        hit['cross-score'] = cross_score

    # Top-5 by bi-encoder similarity.
    st.write("\n-------------------------\n")
    st.title("Top-5 Bi-Encoder Retrieval hits")
    for hit in sorted(hits, key=lambda x: x['score'], reverse=True)[0:5]:
        _render_hit(hit['corpus_id'])

    # Top-5 after cross-encoder re-ranking.
    st.write("\n-------------------------\n")
    st.title("Top-5 Cross-Encoder Re-ranker hits")
    for hit in sorted(hits, key=lambda x: x['cross-score'], reverse=True)[0:5]:
        _render_hit(hit['corpus_id'])
270
+
271
+
272
+
273
+
274
# Placeholder shown in the query box; it doubles as the "nothing entered yet"
# sentinel, so the same constant is used for the default value and the check.
sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
userinput = st.text_input('Tell us what are you looking in your hotel?', sampletext, autocomplete="on")

# Stay dates are collected here but are not passed to the search below.
da = st.date_input(
    "Date Check-in",
    datetime.date(2022, 10, 5))

dst = st.date_input(
    "Date Check-out",
    datetime.date(2022, 10, 8))

# Whitespace-only input is treated like an empty query.
query_text = str(userinput).strip() if userinput else ''
if not query_text or query_text == sampletext:
    st.write("Please enter a query to get results")
else:
    # search() parses the text with spaCy itself, so no separate nlp() call
    # (or unused ``query`` / ``doc`` locals) is needed here.
    search(query_text)
291
+
292
+ # We use cosine-similarity and torch.topk to find the highest 5 scores
293
+
294
+ if __name__ == '__main__':
295
+ main()
paris.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+ """
4
+
5
+
6
+ @author: Hamza Farooq
7
+ """
8
+
9
+ import spacy
10
+ from spacy.lang.en.stop_words import STOP_WORDS
11
+ from string import punctuation
12
+ from collections import Counter
13
+ from heapq import nlargest
14
+ import os
15
+ nlp = spacy.load("en_core_web_sm")
16
+
17
+ from spacy import displacy
18
+ import streamlit as st
19
+ import matplotlib.pyplot as plt
20
+ from wordcloud import WordCloud
21
+ from matplotlib import pyplot as plt
22
+
23
+ import nltk
24
+
25
+
26
+
27
+
28
+
29
+ # import utils as utl
30
+
31
+ import time
32
+ import torch
33
+ import transformers
34
+ from transformers import BartTokenizer, BartForConditionalGeneration
35
+ from string import punctuation
36
+ # tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
37
+
38
+ import numpy as np
39
+ import pandas as pd
40
+ from sentence_transformers import SentenceTransformer
41
+ import scipy.spatial
42
+ import pickle as pkl
43
+ from sentence_transformers import SentenceTransformer, util
44
+ import torch
45
+
46
+
47
+
48
+ def main():
49
+
50
+
51
+
52
+
53
+ # Settings
54
+ st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈" )
55
+ from string import punctuation
56
+ punctuation=punctuation+ '\n'
57
+
58
+ # def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
59
+ #
60
+ # text = text.replace('\n','')
61
+ # text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
62
+ # summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
63
+ # summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
64
+ # return summary_txt
65
+
66
+ from sentence_transformers import SentenceTransformer, util
67
+ import torch
68
+ import numpy as np
69
+ import pandas as pd
70
+ from sentence_transformers import SentenceTransformer
71
+ import scipy.spatial
72
+ import pickle as pkl
73
+ from sentence_transformers import SentenceTransformer, util
74
+ import torch
75
+ #import os
76
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the 'all-MiniLM-L6-v2' sentence encoder used to embed queries.

    The ``st.cache`` decorator lets Streamlit reuse the returned model
    instead of rebuilding it each time the script runs.
    """
    sentence_encoder = SentenceTransformer('all-MiniLM-L6-v2')
    return sentence_encoder
79
+ embedder = load_model()
80
+ # embedder = SentenceTransformer('all-MiniLM-L6-v2')
81
+
82
+ # gc = geonamescache.GeonamesCache()
83
+ #
84
+ # # gets nested dictionary for countries
85
+ # countries = gc.get_countries()
86
+ #
87
+ # # gets nested dictionary for cities
88
+ # cities = gc.get_cities()
89
+ # def gen_dict_extract(var, key):
90
+ # if isinstance(var, dict):
91
+ # for k, v in var.items():
92
+ # if k == key:
93
+ # yield v
94
+ # if isinstance(v, (dict, list)):
95
+ # yield from gen_dict_extract(v, key)
96
+ # elif isinstance(var, list):
97
+ # for d in var:
98
+ # yield from gen_dict_extract(d, key)
99
+ #
100
+ # cities = [*gen_dict_extract(cities, 'name')]
101
+ # countries = [*gen_dict_extract(countries, 'name')]
102
+ #
103
+ # cities.append('New York')
104
+
105
+
106
+
107
+
108
+ # mask = np.array(Image.open('upvote.png'))
109
+
110
+
111
+ #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
112
# Page title plus a collapsible "about" panel rendered as Markdown.
st.title("Parisian Hotel Finder")
with st.expander("ℹ️ - About this app", expanded=True):
    # Markdown only renders a link when "](" is written without a space
    # between the label and the URL, so the two parts are joined here.
    # Explicit string concatenation keeps the bullets at column 0 regardless
    # of how deeply this block is indented in the file.
    st.write(
        "- This app allows you to search for hotels based on what you're looking for, "
        "rather than just cities - it helps with reducing time to go through exhaustive "
        "reviews for each hotel!\n"
        "- It uses an innovative semantic search approach that leverages multiple NLP "
        "embeddings and relies on [Transformers](https://huggingface.co/transformers/) 🤗.\n"
    )
121
+
122
+
123
+ punctuation=punctuation+ '\n'
124
+
125
+
126
+ #import os
127
+
128
+ # embedder = SentenceTransformer('all-MiniLM-L6-v2')
129
+
130
import re

# Per-review rows; only the hotel name and the review text are needed.
df_all = pd.read_csv('combined_paris.csv')
df_all = df_all[['Hotel', 'review']]

# Drop repeated (Hotel, review) pairs and renumber the rows 0..n-1; later
# code uses these positions to pick rows out of 'embeddings_h_r.npy'.
df_all = df_all.drop_duplicates()
df_all = df_all.reset_index(drop=True)

# Pre-computed per-hotel review summaries.
summary_hotel = pd.read_csv('df_combined_paris.csv')

# One row per hotel with all of its reviews concatenated.
df_combined = (
    df_all.sort_values(['Hotel'])
    .groupby('Hotel', sort=False)
    .review.apply(''.join)
    .reset_index(name='all_review')
)

# Keep only ASCII letters, digits and whitespace. The range must be ``A-Z``:
# ``A-z`` also spans the characters between 'Z' and 'a' in ASCII
# ([ \ ] ^ _ `), which would then survive the cleanup. A raw string is used
# so that ``\s`` is not treated as a string escape.
non_alnum = re.compile(r'[^a-zA-Z0-9\s]')
df_combined['all_review'] = df_combined['all_review'].apply(lambda x: non_alnum.sub('', x))
148
+
149
def lower_case(input_str):
    """Return ``input_str`` converted to lower case."""
    return input_str.lower()
152
+
153
+ df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
154
+
155
+ df = df_combined
156
+
157
+ df_sentences = df_combined.set_index("all_review")
158
+
159
+ df_sentences = df_sentences["Hotel"].to_dict()
160
+ df_sentences_list = list(df_sentences.keys())
161
+
162
+ import pandas as pd
163
+ from tqdm import tqdm
164
+ from sentence_transformers import SentenceTransformer, util
165
+
166
+ df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
167
+ #
168
+ corpus = df_sentences_list
169
+ # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
170
+ corpus_embeddings = np.load('embeddings_review.npy')
171
+ corpus_embeddings_h = np.load('embeddings_h_r.npy')
172
+ #
173
+ # model = SentenceTransformer('all-MiniLM-L6-v2')
174
+ # paraphrases = util.paraphrase_mining(model, corpus)
175
+
176
+ #queries = ['Hotel close to Central Park',
177
+ # 'Hotel with breakfast'
178
+ # ]
179
+
180
+
181
+ # from transformers import AutoTokenizer, AutoModel
182
+ # import torch
183
+ # import torch.nn.functional as F
184
+ #
185
+ # #Mean Pooling - Take attention mask into account for correct averaging
186
+ # def mean_pooling(model_output, attention_mask):
187
+ # token_embeddings = model_output[0] #First element of model_output contains all token embeddings
188
+ # input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
189
+ # return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
190
+ #
191
+ #
192
+ # # Sentences we want sentence embeddings for
193
+ # sentences = corpus
194
+ #
195
+ # # Load model from HuggingFace Hub
196
+ # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
197
+ # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
198
+ #
199
+ # # Tokenize sentences
200
+ # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
201
+ #
202
+ # # Compute token embeddings
203
+ # with torch.no_grad():
204
+ # model_output = model(**encoded_input)
205
+ #
206
+ # # Perform pooling
207
+ # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
208
+ #
209
+ # # Normalize embeddings
210
+ # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
211
+ #
212
+ # st.text("Sentence embeddings:")
213
+ # st.text(sentence_embeddings)
214
+ #
215
+ #
216
+
217
+ #corpus_embeddings = sentence_embeddings
218
+ # Query sentences
219
+
220
def plot_cloud(wordcloud):
    """Draw ``wordcloud`` on a 20x10 Matplotlib figure and show it in the page.

    Args:
        wordcloud: image-like object accepted by ``Axes.imshow``
            (presumably a ``wordcloud.WordCloud`` instance -- confirm
            against the caller).
    """
    # ``st.pyplot`` is a function, not the pyplot module, so the figure has
    # to be created through Matplotlib and then handed to Streamlit.
    fig, ax = plt.subplots(figsize=(20, 10))
    ax.imshow(wordcloud)
    # No axis details
    ax.axis("off")
    st.pyplot(fig)
    # Release the figure so repeated calls do not accumulate open figures.
    plt.close(fig)
227
# Placeholder shown in the query box; it doubles as the "nothing entered yet"
# sentinel, so the same constant is used for the default value and the check.
sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
userinput = st.text_input('Tell us what are you looking in your hotel?', sampletext, autocomplete="on")
if not userinput or userinput == sampletext:
    st.write("Please enter a query to get results")
else:
    query = [str(userinput)]
    doc = nlp(str(userinput))
    top_k = min(5, len(corpus))

    query_embedding = embedder.encode(query, convert_to_tensor=True)

    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)

    # Display the entity visualization in the browser:
    ent_html = displacy.render(doc, style="ent", jupyter=False)
    st.markdown(ent_html, unsafe_allow_html=True)

    st.write("##")
    st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
    st.write("\n\n======================\n\n")

    for hotel_score, hotel_pos in zip(top_results.values.tolist(), top_results.indices.tolist()):
        # Map the matching corpus passage back to its hotel.
        matched = df.loc[df['all_review'] == corpus[hotel_pos]]
        hotel_name = matched['Hotel'].values[0]
        st.subheader(hotel_name)

        hotel_subset = df_all.loc[df_all['Hotel'] == hotel_name]
        hotel_sub = summary_hotel.loc[summary_hotel['Hotel'] == hotel_name]
        st.caption("Review Summary:")
        # A hotel without a row in the summary file simply shows no summary
        # instead of raising IndexError.
        if not hotel_sub.empty:
            st.write(hotel_sub['summary'].values[0])
        st.caption("Relevancy: {:.4f}".format(hotel_score))
        st.caption("Relevant reviews:")

        # Reviews of this hotel and their pre-computed embeddings.
        # NOTE(review): assumes row i of 'embeddings_h_r.npy' belongs to row i
        # of df_all (after drop_duplicates / reset_index) -- confirm against
        # the code that produced the file.
        corpus_h = [str(review) for review in hotel_subset['review']]
        hotel_embeddings = corpus_embeddings_h[hotel_subset.index.values]

        cos_scores_h = util.pytorch_cos_sim(query_embedding, hotel_embeddings)[0]
        # A hotel can have fewer than top_k reviews; torch.topk raises if k
        # exceeds the number of scores.
        top_results_h = torch.topk(cos_scores_h, k=min(top_k, len(corpus_h)))

        # Distinct loop variable so the outer hotel score/position are not shadowed.
        for review_pos in top_results_h.indices.tolist():
            st.write(corpus_h[review_pos])
296
+
297
+ if __name__ == '__main__':
298
+ main()
paris_clean_newer.csv ADDED
The diff for this file is too large to render. See raw diff
 
query_generator.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ pandas
2
+ streamlit==1.1.0
3
+ regex==2021.8.3
4
+ scikit-learn
5
+ sentence_transformers
6
+ scipy
7
+ tqdm
8
+ gensim
9
+ plotly
10
+ wordcloud
11
+ matplotlib
12
+ spacy
13
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
14
+ rank-bm25
summary.ipynb ADDED
@@ -0,0 +1,654 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import torch\n",
10
+ "import transformers\n",
11
+ "from transformers import BartTokenizer, BartForConditionalGeneration\n",
12
+ "tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')\n",
13
+ "mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')\n",
14
+ "torch_device = 'cpu'\n"
15
+ ]
16
+ },
17
+ {
18
+ "cell_type": "code",
19
+ "execution_count": 3,
20
+ "metadata": {},
21
+ "outputs": [],
22
+ "source": [
23
+ "def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):\n",
24
+ "\n",
25
+ " text = text.replace('\\n','')\n",
26
+ " text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)\n",
27
+ " summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))\n",
28
+ " summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)\n",
29
+ " return summary_txt"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 4,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "import pandas as pd\n",
39
+ "from sentence_transformers import SentenceTransformer\n",
40
+ "import scipy.spatial\n",
41
+ "import pickle as pkl\n",
42
+ "from sentence_transformers import SentenceTransformer, util\n",
43
+ "import torch\n",
44
+ "#import os\n",
45
+ "\n",
46
+ "\n",
47
+ "df = pd.read_csv('combined_paris.csv')\n",
48
+ "\n",
49
+ "\n",
50
+ "df_combined = df.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')\n",
51
+ "\n",
52
+ "import re\n",
53
+ "\n",
54
+ "df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\\s]','',x))\n",
55
+ "def lower_case(input_str):\n",
56
+ " input_str = input_str.lower()\n",
57
+ " return input_str"
58
+ ]
59
+ },
60
+ {
61
+ "cell_type": "code",
62
+ "execution_count": 5,
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))\n",
67
+ "\n",
68
+ "df = df_combined\n",
69
+ "\n",
70
+ "df_sentences = df_combined.set_index(\"all_review\")\n",
71
+ "\n",
72
+ "df_sentences = df_sentences[\"Hotel\"].to_dict()\n",
73
+ "df_sentences_list = list(df_sentences.keys())\n"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 6,
79
+ "metadata": {},
80
+ "outputs": [
81
+ {
82
+ "data": {
83
+ "text/html": [
84
+ "<div>\n",
85
+ "<style scoped>\n",
86
+ " .dataframe tbody tr th:only-of-type {\n",
87
+ " vertical-align: middle;\n",
88
+ " }\n",
89
+ "\n",
90
+ " .dataframe tbody tr th {\n",
91
+ " vertical-align: top;\n",
92
+ " }\n",
93
+ "\n",
94
+ " .dataframe thead th {\n",
95
+ " text-align: right;\n",
96
+ " }\n",
97
+ "</style>\n",
98
+ "<table border=\"1\" class=\"dataframe\">\n",
99
+ " <thead>\n",
100
+ " <tr style=\"text-align: right;\">\n",
101
+ " <th></th>\n",
102
+ " <th>0</th>\n",
103
+ " <th>1</th>\n",
104
+ " <th>2</th>\n",
105
+ " <th>3</th>\n",
106
+ " <th>4</th>\n",
107
+ " </tr>\n",
108
+ " </thead>\n",
109
+ " <tbody>\n",
110
+ " <tr>\n",
111
+ " <th>Hotel</th>\n",
112
+ " <td>25hours Hotel Terminus Nord</td>\n",
113
+ " <td>Acacias Etoile Hotel</td>\n",
114
+ " <td>COQ Hotel Paris</td>\n",
115
+ " <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
116
+ " <td>Cler Hotel</td>\n",
117
+ " </tr>\n",
118
+ " <tr>\n",
119
+ " <th>all_review</th>\n",
120
+ " <td>weve spent lots of time in paris and this was ...</td>\n",
121
+ " <td>the hotel is great for value the breakfast sel...</td>\n",
122
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
123
+ " <td>room was very clean transportation is very ne...</td>\n",
124
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
125
+ " </tr>\n",
126
+ " </tbody>\n",
127
+ "</table>\n",
128
+ "</div>"
129
+ ],
130
+ "text/plain": [
131
+ " 0 \\\n",
132
+ "Hotel 25hours Hotel Terminus Nord \n",
133
+ "all_review weve spent lots of time in paris and this was ... \n",
134
+ "\n",
135
+ " 1 \\\n",
136
+ "Hotel Acacias Etoile Hotel \n",
137
+ "all_review the hotel is great for value the breakfast sel... \n",
138
+ "\n",
139
+ " 2 \\\n",
140
+ "Hotel COQ Hotel Paris \n",
141
+ "all_review stayed for a short city break the hotel is a ... \n",
142
+ "\n",
143
+ " 3 \\\n",
144
+ "Hotel Campanile Paris 14 - Maine Montparnasse \n",
145
+ "all_review room was very clean transportation is very ne... \n",
146
+ "\n",
147
+ " 4 \n",
148
+ "Hotel Cler Hotel \n",
149
+ "all_review we had the best stay at cler hotel the locati... "
150
+ ]
151
+ },
152
+ "execution_count": 6,
153
+ "metadata": {},
154
+ "output_type": "execute_result"
155
+ }
156
+ ],
157
+ "source": [
158
+ "df_combined.head().T"
159
+ ]
160
+ },
161
+ {
162
+ "cell_type": "code",
163
+ "execution_count": 7,
164
+ "metadata": {},
165
+ "outputs": [
166
+ {
167
+ "name": "stderr",
168
+ "output_type": "stream",
169
+ "text": [
170
+ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
171
+ ]
172
+ }
173
+ ],
174
+ "source": [
175
+ "long_summary = []\n",
176
+ "\n",
177
+ "for i in range(len(df_combined)):\n",
178
+ " t = bart_summarize(df_combined['all_review'][i])\n",
179
+ " long_summary.append(t)"
180
+ ]
181
+ },
182
+ {
183
+ "cell_type": "code",
184
+ "execution_count": 8,
185
+ "metadata": {},
186
+ "outputs": [],
187
+ "source": [
188
+ "df_combined['summary'] = long_summary"
189
+ ]
190
+ },
191
+ {
192
+ "cell_type": "code",
193
+ "execution_count": 9,
194
+ "metadata": {},
195
+ "outputs": [],
196
+ "source": [
197
+ "df_combined.to_csv('df_combined_paris.csv',index=False)"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 10,
203
+ "metadata": {},
204
+ "outputs": [
205
+ {
206
+ "data": {
207
+ "text/html": [
208
+ "<div>\n",
209
+ "<style scoped>\n",
210
+ " .dataframe tbody tr th:only-of-type {\n",
211
+ " vertical-align: middle;\n",
212
+ " }\n",
213
+ "\n",
214
+ " .dataframe tbody tr th {\n",
215
+ " vertical-align: top;\n",
216
+ " }\n",
217
+ "\n",
218
+ " .dataframe thead th {\n",
219
+ " text-align: right;\n",
220
+ " }\n",
221
+ "</style>\n",
222
+ "<table border=\"1\" class=\"dataframe\">\n",
223
+ " <thead>\n",
224
+ " <tr style=\"text-align: right;\">\n",
225
+ " <th></th>\n",
226
+ " <th>Hotel</th>\n",
227
+ " <th>all_review</th>\n",
228
+ " <th>summary</th>\n",
229
+ " </tr>\n",
230
+ " </thead>\n",
231
+ " <tbody>\n",
232
+ " <tr>\n",
233
+ " <th>0</th>\n",
234
+ " <td>25hours Hotel Terminus Nord</td>\n",
235
+ " <td>weve spent lots of time in paris and this was ...</td>\n",
236
+ " <td>we were blown away by this excellent hotel we ...</td>\n",
237
+ " </tr>\n",
238
+ " <tr>\n",
239
+ " <th>1</th>\n",
240
+ " <td>Acacias Etoile Hotel</td>\n",
241
+ " <td>the hotel is great for value the breakfast sel...</td>\n",
242
+ " <td>The hotel is great for value the breakfast sel...</td>\n",
243
+ " </tr>\n",
244
+ " <tr>\n",
245
+ " <th>2</th>\n",
246
+ " <td>COQ Hotel Paris</td>\n",
247
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
248
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
249
+ " </tr>\n",
250
+ " <tr>\n",
251
+ " <th>3</th>\n",
252
+ " <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
253
+ " <td>room was very clean transportation is very ne...</td>\n",
254
+ " <td>hotel turned out to be perfect for our short ...</td>\n",
255
+ " </tr>\n",
256
+ " <tr>\n",
257
+ " <th>4</th>\n",
258
+ " <td>Cler Hotel</td>\n",
259
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
260
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
261
+ " </tr>\n",
262
+ " </tbody>\n",
263
+ "</table>\n",
264
+ "</div>"
265
+ ],
266
+ "text/plain": [
267
+ " Hotel \\\n",
268
+ "0 25hours Hotel Terminus Nord \n",
269
+ "1 Acacias Etoile Hotel \n",
270
+ "2 COQ Hotel Paris \n",
271
+ "3 Campanile Paris 14 - Maine Montparnasse \n",
272
+ "4 Cler Hotel \n",
273
+ "\n",
274
+ " all_review \\\n",
275
+ "0 weve spent lots of time in paris and this was ... \n",
276
+ "1 the hotel is great for value the breakfast sel... \n",
277
+ "2 stayed for a short city break the hotel is a ... \n",
278
+ "3 room was very clean transportation is very ne... \n",
279
+ "4 we had the best stay at cler hotel the locati... \n",
280
+ "\n",
281
+ " summary \n",
282
+ "0 we were blown away by this excellent hotel we ... \n",
283
+ "1 The hotel is great for value the breakfast sel... \n",
284
+ "2 stayed for a short city break the hotel is a ... \n",
285
+ "3 hotel turned out to be perfect for our short ... \n",
286
+ "4 we had the best stay at cler hotel the locati... "
287
+ ]
288
+ },
289
+ "execution_count": 10,
290
+ "metadata": {},
291
+ "output_type": "execute_result"
292
+ }
293
+ ],
294
+ "source": [
295
+ "df_combined.head()"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": null,
301
+ "metadata": {},
302
+ "outputs": [],
303
+ "source": []
304
+ },
305
+ {
306
+ "cell_type": "code",
307
+ "execution_count": 3,
308
+ "metadata": {},
309
+ "outputs": [
310
+ {
311
+ "name": "stdout",
312
+ "output_type": "stream",
313
+ "text": [
314
+ "Dockerfile df_combined.csv\n",
315
+ "Hotel New York Combined.csv en_core_web_sm-3.2.0-py3-none-any.whl\n",
316
+ "README.md query_generator.ipynb\n",
317
+ "Untitled.ipynb requirements.txt\n",
318
+ "app.py summary.ipynb\n",
319
+ "app.yaml\n"
320
+ ]
321
+ }
322
+ ],
323
+ "source": [
324
+ "!ls"
325
+ ]
326
+ },
327
+ {
328
+ "cell_type": "code",
329
+ "execution_count": 3,
330
+ "metadata": {},
331
+ "outputs": [
332
+ {
333
+ "name": "stderr",
334
+ "output_type": "stream",
335
+ "text": [
336
+ "/Users/aimzlicious/miniforge3/envs/tf_m1/lib/python3.8/site-packages/huggingface_hub/snapshot_download.py:6: FutureWarning: snapshot_download.py has been made private and will no longer be available from version 0.11. Please use `from huggingface_hub import snapshot_download` to import the only public function in this module. Other members of the file may be changed without a deprecation notice.\n",
337
+ " warnings.warn(\n"
338
+ ]
339
+ }
340
+ ],
341
+ "source": [
342
+ "import pandas as pd\n",
343
+ "from sentence_transformers import SentenceTransformer\n",
344
+ "import scipy.spatial\n",
345
+ "import pickle as pkl\n",
346
+ "from sentence_transformers import SentenceTransformer, util\n",
347
+ "import torch\n",
348
+ "df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 4,
354
+ "metadata": {},
355
+ "outputs": [
356
+ {
357
+ "data": {
358
+ "text/html": [
359
+ "<div>\n",
360
+ "<style scoped>\n",
361
+ " .dataframe tbody tr th:only-of-type {\n",
362
+ " vertical-align: middle;\n",
363
+ " }\n",
364
+ "\n",
365
+ " .dataframe tbody tr th {\n",
366
+ " vertical-align: top;\n",
367
+ " }\n",
368
+ "\n",
369
+ " .dataframe thead th {\n",
370
+ " text-align: right;\n",
371
+ " }\n",
372
+ "</style>\n",
373
+ "<table border=\"1\" class=\"dataframe\">\n",
374
+ " <thead>\n",
375
+ " <tr style=\"text-align: right;\">\n",
376
+ " <th></th>\n",
377
+ " <th>Hotel</th>\n",
378
+ " <th>all_review</th>\n",
379
+ " <th>summary</th>\n",
380
+ " </tr>\n",
381
+ " </thead>\n",
382
+ " <tbody>\n",
383
+ " <tr>\n",
384
+ " <th>0</th>\n",
385
+ " <td>25hours Hotel Terminus Nord</td>\n",
386
+ " <td>weve spent lots of time in paris and this was ...</td>\n",
387
+ " <td>we were blown away by this excellent hotel we ...</td>\n",
388
+ " </tr>\n",
389
+ " <tr>\n",
390
+ " <th>1</th>\n",
391
+ " <td>Acacias Etoile Hotel</td>\n",
392
+ " <td>the hotel is great for value the breakfast sel...</td>\n",
393
+ " <td>The hotel is great for value the breakfast sel...</td>\n",
394
+ " </tr>\n",
395
+ " <tr>\n",
396
+ " <th>2</th>\n",
397
+ " <td>COQ Hotel Paris</td>\n",
398
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
399
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
400
+ " </tr>\n",
401
+ " <tr>\n",
402
+ " <th>3</th>\n",
403
+ " <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
404
+ " <td>room was very clean transportation is very ne...</td>\n",
405
+ " <td>hotel turned out to be perfect for our short ...</td>\n",
406
+ " </tr>\n",
407
+ " <tr>\n",
408
+ " <th>4</th>\n",
409
+ " <td>Cler Hotel</td>\n",
410
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
411
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
412
+ " </tr>\n",
413
+ " </tbody>\n",
414
+ "</table>\n",
415
+ "</div>"
416
+ ],
417
+ "text/plain": [
418
+ " Hotel \\\n",
419
+ "0 25hours Hotel Terminus Nord \n",
420
+ "1 Acacias Etoile Hotel \n",
421
+ "2 COQ Hotel Paris \n",
422
+ "3 Campanile Paris 14 - Maine Montparnasse \n",
423
+ "4 Cler Hotel \n",
424
+ "\n",
425
+ " all_review \\\n",
426
+ "0 weve spent lots of time in paris and this was ... \n",
427
+ "1 the hotel is great for value the breakfast sel... \n",
428
+ "2 stayed for a short city break the hotel is a ... \n",
429
+ "3 room was very clean transportation is very ne... \n",
430
+ "4 we had the best stay at cler hotel the locati... \n",
431
+ "\n",
432
+ " summary \n",
433
+ "0 we were blown away by this excellent hotel we ... \n",
434
+ "1 The hotel is great for value the breakfast sel... \n",
435
+ "2 stayed for a short city break the hotel is a ... \n",
436
+ "3 hotel turned out to be perfect for our short ... \n",
437
+ "4 we had the best stay at cler hotel the locati... "
438
+ ]
439
+ },
440
+ "execution_count": 4,
441
+ "metadata": {},
442
+ "output_type": "execute_result"
443
+ }
444
+ ],
445
+ "source": [
446
+ "df_combined_paris.head()"
447
+ ]
448
+ },
449
+ {
450
+ "cell_type": "code",
451
+ "execution_count": 5,
452
+ "metadata": {},
453
+ "outputs": [],
454
+ "source": [
455
+ "df_paris = pd.read_csv('paris_clean_newer.csv')"
456
+ ]
457
+ },
458
+ {
459
+ "cell_type": "code",
460
+ "execution_count": 9,
461
+ "metadata": {},
462
+ "outputs": [],
463
+ "source": [
464
+ "hotel=pd.DataFrame(df_paris['Hotel'].drop_duplicates())"
465
+ ]
466
+ },
467
+ {
468
+ "cell_type": "code",
469
+ "execution_count": 11,
470
+ "metadata": {},
471
+ "outputs": [
472
+ {
473
+ "data": {
474
+ "text/html": [
475
+ "<div>\n",
476
+ "<style scoped>\n",
477
+ " .dataframe tbody tr th:only-of-type {\n",
478
+ " vertical-align: middle;\n",
479
+ " }\n",
480
+ "\n",
481
+ " .dataframe tbody tr th {\n",
482
+ " vertical-align: top;\n",
483
+ " }\n",
484
+ "\n",
485
+ " .dataframe thead th {\n",
486
+ " text-align: right;\n",
487
+ " }\n",
488
+ "</style>\n",
489
+ "<table border=\"1\" class=\"dataframe\">\n",
490
+ " <thead>\n",
491
+ " <tr style=\"text-align: right;\">\n",
492
+ " <th></th>\n",
493
+ " <th>Hotel</th>\n",
494
+ " <th>all_review</th>\n",
495
+ " <th>summary</th>\n",
496
+ " </tr>\n",
497
+ " </thead>\n",
498
+ " <tbody>\n",
499
+ " <tr>\n",
500
+ " <th>0</th>\n",
501
+ " <td>25hours Hotel Terminus Nord</td>\n",
502
+ " <td>weve spent lots of time in paris and this was ...</td>\n",
503
+ " <td>we were blown away by this excellent hotel we ...</td>\n",
504
+ " </tr>\n",
505
+ " <tr>\n",
506
+ " <th>1</th>\n",
507
+ " <td>Acacias Etoile Hotel</td>\n",
508
+ " <td>the hotel is great for value the breakfast sel...</td>\n",
509
+ " <td>The hotel is great for value the breakfast sel...</td>\n",
510
+ " </tr>\n",
511
+ " <tr>\n",
512
+ " <th>2</th>\n",
513
+ " <td>COQ Hotel Paris</td>\n",
514
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
515
+ " <td>stayed for a short city break the hotel is a ...</td>\n",
516
+ " </tr>\n",
517
+ " <tr>\n",
518
+ " <th>3</th>\n",
519
+ " <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
520
+ " <td>room was very clean transportation is very ne...</td>\n",
521
+ " <td>hotel turned out to be perfect for our short ...</td>\n",
522
+ " </tr>\n",
523
+ " <tr>\n",
524
+ " <th>4</th>\n",
525
+ " <td>Cler Hotel</td>\n",
526
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
527
+ " <td>we had the best stay at cler hotel the locati...</td>\n",
528
+ " </tr>\n",
529
+ " <tr>\n",
530
+ " <th>...</th>\n",
531
+ " <td>...</td>\n",
532
+ " <td>...</td>\n",
533
+ " <td>...</td>\n",
534
+ " </tr>\n",
535
+ " <tr>\n",
536
+ " <th>89</th>\n",
537
+ " <td>Sofitel Paris Le Faubourg</td>\n",
538
+ " <td>4 years ago i was the last time at sofitel le ...</td>\n",
539
+ " <td>4 years ago i was the last time at sofitel le ...</td>\n",
540
+ " </tr>\n",
541
+ " <tr>\n",
542
+ " <th>90</th>\n",
543
+ " <td>St Christopher's Gare du Nord Paris</td>\n",
544
+ " <td>when arriving to the area it felt a little dan...</td>\n",
545
+ " <td>Barry is the best bartender in paris cheers gr...</td>\n",
546
+ " </tr>\n",
547
+ " <tr>\n",
548
+ " <th>91</th>\n",
549
+ " <td>St Christopher's Inn Canal Paris</td>\n",
550
+ " <td>ive stayed at st christopher inn canal in pari...</td>\n",
551
+ " <td>ive stayed at st christopher inn canal in pari...</td>\n",
552
+ " </tr>\n",
553
+ " <tr>\n",
554
+ " <th>92</th>\n",
555
+ " <td>Touring Hotel</td>\n",
556
+ " <td>hotel is in a great location minutes walk fro...</td>\n",
557
+ " <td>Hotel is in a great location minutes walk fro...</td>\n",
558
+ " </tr>\n",
559
+ " <tr>\n",
560
+ " <th>93</th>\n",
561
+ " <td>Warwick Paris</td>\n",
562
+ " <td>if i know of anybody heading to paris i will r...</td>\n",
563
+ " <td>warwick hotel in paris is a good hotel to stay...</td>\n",
564
+ " </tr>\n",
565
+ " </tbody>\n",
566
+ "</table>\n",
567
+ "<p>94 rows × 3 columns</p>\n",
568
+ "</div>"
569
+ ],
570
+ "text/plain": [
571
+ " Hotel \\\n",
572
+ "0 25hours Hotel Terminus Nord \n",
573
+ "1 Acacias Etoile Hotel \n",
574
+ "2 COQ Hotel Paris \n",
575
+ "3 Campanile Paris 14 - Maine Montparnasse \n",
576
+ "4 Cler Hotel \n",
577
+ ".. ... \n",
578
+ "89 Sofitel Paris Le Faubourg \n",
579
+ "90 St Christopher's Gare du Nord Paris \n",
580
+ "91 St Christopher's Inn Canal Paris \n",
581
+ "92 Touring Hotel \n",
582
+ "93 Warwick Paris \n",
583
+ "\n",
584
+ " all_review \\\n",
585
+ "0 weve spent lots of time in paris and this was ... \n",
586
+ "1 the hotel is great for value the breakfast sel... \n",
587
+ "2 stayed for a short city break the hotel is a ... \n",
588
+ "3 room was very clean transportation is very ne... \n",
589
+ "4 we had the best stay at cler hotel the locati... \n",
590
+ ".. ... \n",
591
+ "89 4 years ago i was the last time at sofitel le ... \n",
592
+ "90 when arriving to the area it felt a little dan... \n",
593
+ "91 ive stayed at st christopher inn canal in pari... \n",
594
+ "92 hotel is in a great location minutes walk fro... \n",
595
+ "93 if i know of anybody heading to paris i will r... \n",
596
+ "\n",
597
+ " summary \n",
598
+ "0 we were blown away by this excellent hotel we ... \n",
599
+ "1 The hotel is great for value the breakfast sel... \n",
600
+ "2 stayed for a short city break the hotel is a ... \n",
601
+ "3 hotel turned out to be perfect for our short ... \n",
602
+ "4 we had the best stay at cler hotel the locati... \n",
603
+ ".. ... \n",
604
+ "89 4 years ago i was the last time at sofitel le ... \n",
605
+ "90 Barry is the best bartender in paris cheers gr... \n",
606
+ "91 ive stayed at st christopher inn canal in pari... \n",
607
+ "92 Hotel is in a great location minutes walk fro... \n",
608
+ "93 warwick hotel in paris is a good hotel to stay... \n",
609
+ "\n",
610
+ "[94 rows x 3 columns]"
611
+ ]
612
+ },
613
+ "execution_count": 11,
614
+ "metadata": {},
615
+ "output_type": "execute_result"
616
+ }
617
+ ],
618
+ "source": [
619
+ "df_combined_paris.merge(hotel,how='left')"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "code",
624
+ "execution_count": null,
625
+ "metadata": {},
626
+ "outputs": [],
627
+ "source": []
628
+ }
629
+ ],
630
+ "metadata": {
631
+ "interpreter": {
632
+ "hash": "4bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
633
+ },
634
+ "kernelspec": {
635
+ "display_name": "Python 3 (ipykernel)",
636
+ "language": "python",
637
+ "name": "python3"
638
+ },
639
+ "language_info": {
640
+ "codemirror_mode": {
641
+ "name": "ipython",
642
+ "version": 3
643
+ },
644
+ "file_extension": ".py",
645
+ "mimetype": "text/x-python",
646
+ "name": "python",
647
+ "nbconvert_exporter": "python",
648
+ "pygments_lexer": "ipython3",
649
+ "version": "3.8.12"
650
+ }
651
+ },
652
+ "nbformat": 4,
653
+ "nbformat_minor": 4
654
+ }
tokenized_corpus.pickle ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e99b20be01f7889248d5b3f667df8947ae6ca676f3a525717305e5124c8b739e
3
+ size 1261235