Spaces:

hamza50
/

hotelfinder

Configuration error

App Files Files Community

hamza50 committed on Nov 1, 2022

Commit

27cc973

•

1 Parent(s): dd2e37f

Upload 23 files

Browse files

Files changed (24) hide show

.gitattributes +1 -0
Dockerfile +20 -0
Hotel New York Combined.csv +0 -0
README.md +1 -13
Untitled.ipynb +90 -0
app.py +297 -0
app.yaml +3 -0
basic.py +166 -0
combined_paris.csv +0 -0
corpus_embeddings_bi_encoder.pickle +3 -0
corpus_embeddings_bi_encoder.pickle 2 +0 -0
df_combined.csv +0 -0
df_combined_paris.csv +0 -0
embeddings.npy +3 -0
embeddings_h_r.npy +3 -0
embeddings_review.npy +3 -0
en_core_web_sm-3.2.0-py3-none-any.whl +3 -0
paris-newer.py +295 -0
paris.py +298 -0
paris_clean_newer.csv +0 -0
query_generator.ipynb +0 -0
requirements.txt +14 -0
summary.ipynb +654 -0
tokenized_corpus.pickle +3 -0

.gitattributes CHANGED Viewed

@@ -31,3 +31,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text

 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+en_core_web_sm-3.2.0-py3-none-any.whl filter=lfs diff=lfs merge=lfs -text

Dockerfile ADDED Viewed

	@@ -0,0 +1,20 @@

+#Base Image to use
+FROM python:3.7.9
+#Expose port 8080
+EXPOSE 8080
+#Copy the requirements.txt file into the app directory
+COPY requirements.txt app/requirements.txt
+#install all requirements in requirements.txt
+RUN pip3 install -r app/requirements.txt
+#Copy all files in current directory into app directory
+COPY . /app
+#Change Working Directory to app directory
+WORKDIR /app
+#Run the application on port 8080
+ENTRYPOINT ["streamlit", "run", "app.py", "--server.port=8080", "--server.address=0.0.0.0"]

Hotel New York Combined.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

README.md CHANGED Viewed

@@ -1,13 +1 @@
----
-title: Hotelfinder
-emoji: 🚀
-colorFrom: purple
-colorTo: yellow
-sdk: gradio
-sdk_version: 3.6
-app_file: app.py
-pinned: false
-license: creativeml-openrail-m
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference


1	+ assignment3

Untitled.ipynb ADDED Viewed

	@@ -0,0 +1,90 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "611a3e0e",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Init Plugin\n",
+      "Init Graph Optimizer\n",
+      "Init Kernel\n",
+      "Collecting en-core-web-sm==3.2.0\n",
+      "  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl (13.9 MB)\n",
+      "     |████████████████████████████████| 13.9 MB 463 kB/s            \n",
+      "\u001b[?25hRequirement already satisfied: spacy<3.3.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from en-core-web-sm==3.2.0) (3.2.1)\n",
+      "Requirement already satisfied: blis<0.8.0,>=0.4.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.7.5)\n",
+      "Requirement already satisfied: thinc<8.1.0,>=8.0.12 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.13)\n",
+      "Requirement already satisfied: srsly<3.0.0,>=2.4.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.2)\n",
+      "Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.62.3)\n",
+      "Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.6)\n",
+      "Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
+      "Requirement already satisfied: catalogue<2.1.0,>=2.0.6 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.6)\n",
+      "Requirement already satisfied: packaging>=20.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (21.0)\n",
+      "Requirement already satisfied: langcodes<4.0.0,>=3.2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.3.0)\n",
+      "Requirement already satisfied: spacy-legacy<3.1.0,>=3.0.8 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.8)\n",
+      "Requirement already satisfied: jinja2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.0.1)\n",
+      "Requirement already satisfied: numpy>=1.15.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.21.4)\n",
+      "Requirement already satisfied: pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.8.2)\n",
+      "Requirement already satisfied: spacy-loggers<2.0.0,>=1.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.1)\n",
+      "Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.0.6)\n",
+      "Requirement already satisfied: wasabi<1.1.0,>=0.8.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.8.2)\n",
+      "Requirement already satisfied: pathy>=0.3.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.6.1)\n",
+      "Requirement already satisfied: typer<0.5.0,>=0.3.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (0.4.0)\n",
+      "Requirement already satisfied: requests<3.0.0,>=2.13.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.26.0)\n",
+      "Requirement already satisfied: setuptools in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (59.0.1)\n",
+      "Requirement already satisfied: pyparsing>=2.0.2 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from packaging>=20.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.4.7)\n",
+      "Requirement already satisfied: smart-open<6.0.0,>=5.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pathy>=0.3.5->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (5.2.1)\n",
+      "Requirement already satisfied: typing-extensions>=3.7.4.3 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from pydantic!=1.8,!=1.8.1,<1.9.0,>=1.7.4->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (4.1.1)\n",
+      "Requirement already satisfied: idna<4,>=2.5 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (3.2)\n",
+      "Requirement already satisfied: certifi>=2017.4.17 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2021.5.30)\n",
+      "Requirement already satisfied: charset-normalizer~=2.0.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.4)\n",
+      "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from requests<3.0.0,>=2.13.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (1.26.6)\n",
+      "Requirement already satisfied: click<9.0.0,>=7.1.1 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from typer<0.5.0,>=0.3.0->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (8.0.4)\n",
+      "Requirement already satisfied: MarkupSafe>=2.0 in /opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/lib/python3.9/site-packages (from jinja2->spacy<3.3.0,>=3.2.0->en-core-web-sm==3.2.0) (2.0.1)\n",
+      "\u001b[33mWARNING: You are using pip version 21.3.1; however, version 22.0.4 is available.\n",
+      "You should consider upgrading via the '/opt/homebrew/Caskroom/miniforge/base/envs/tensorflow/bin/python -m pip install --upgrade pip' command.\u001b[0m\n",
+      "\u001b[38;5;2m✔ Download and installation successful\u001b[0m\n",
+      "You can now load the package via spacy.load('en_core_web_sm')\n"
+     ]
+    }
+   ],
+   "source": [
+    "!python -m spacy download en_core_web_sm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "51a414e5",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3.9.5 64-bit ('tensorflow': conda)",
+   "language": "python",
+   "name": "python395jvsc74a57bd04bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

app.py ADDED Viewed

	@@ -0,0 +1,297 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+@author: Hamza Farooq
+"""
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import os
+nlp = spacy.load("en_core_web_sm")
+from spacy import displacy
+import streamlit as st
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from matplotlib import pyplot as plt
+import nltk
+nltk.download('stopwords')
+import geonamescache
+import os
+import streamlit as st
+import utils as utl
+from PIL import Image
+import time
+import torch
+import transformers
+from transformers import BartTokenizer, BartForConditionalGeneration
+tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
+torch_device = 'gpu'
+def main():
+    # Settings
+    st.set_page_config(layout="wide", page_title='New York Hotels')
+    def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
+      text = text.replace('\n','')
+      text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
+      summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
+      summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
+      return summary_txt
+    gc = geonamescache.GeonamesCache()
+    # gets nested dictionary for countries
+    countries = gc.get_countries()
+    # gets nested dictionary for cities
+    cities = gc.get_cities()
+    # def gen_dict_extract(var, key):
+    #     if isinstance(var, dict):
+    #         for k, v in var.items():
+    #             if k == key:
+    #                 yield v
+    #             if isinstance(v, (dict, list)):
+    #                 yield from gen_dict_extract(v, key)
+    #     elif isinstance(var, list):
+    #         for d in var:
+    #             yield from gen_dict_extract(d, key)
+    #
+    # cities = [*gen_dict_extract(cities, 'name')]
+    # countries = [*gen_dict_extract(countries, 'name')]
+    #
+    # cities.append('New York')
+    from nltk.corpus import stopwords
+    stopwords = set(stopwords.words('english'))
+    #mask = np.array(Image.open('upvote.png'))
+    from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
+    import matplotlib.pyplot as plt
+    #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
+    st.title("New York Hotel Finder")
+    stopwords=list(STOP_WORDS)
+    stopwords.extend(['hotel','room','rooms'])
+    from string import punctuation
+    punctuation=punctuation+ '\n'
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    import scipy.spatial
+    import pickle as pkl
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    #import os
+    embedder = SentenceTransformer('all-MiniLM-L6-v2')
+    df_all = pd.read_csv('Hotel New York Combined.csv')
+    df_all = df_all[['hotel_name','review_body']]
+    #
+    # df['hotel_name'].drop_duplicates()
+    # df_combined = df.sort_values(['hotel_name']).groupby('hotel_name', sort=False).review_body.apply(''.join).reset_index(name='all_review')
+    import re
+    df_combined = pd.read_csv('df_combined.csv')
+    # df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
+    #
+    # def lower_case(input_str):
+    #     input_str = input_str.lower()
+    #     return input_str
+    #
+    # df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
+    df = df_combined
+    df_sentences = df_combined.set_index("all_review")
+    df_sentences = df_sentences["hotel_name"].to_dict()
+    df_sentences_list = list(df_sentences.keys())
+    import pandas as pd
+    from tqdm import tqdm
+    from sentence_transformers import SentenceTransformer, util
+    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
+    #
+    corpus = df_sentences_list
+    corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
+    #
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    # paraphrases = util.paraphrase_mining(model, corpus)
+    #queries = ['Hotel close to Central Park',
+    #           'Hotel with breakfast'
+    #           ]
+    # from transformers import AutoTokenizer, AutoModel
+    # import torch
+    # import torch.nn.functional as F
+    #
+    # #Mean Pooling - Take attention mask into account for correct averaging
+    # def mean_pooling(model_output, attention_mask):
+    #     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    #     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    #
+    #
+    # # Sentences we want sentence embeddings for
+    # sentences = corpus
+    #
+    # # Load model from HuggingFace Hub
+    # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    #
+    # # Tokenize sentences
+    # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    #
+    # # Compute token embeddings
+    # with torch.no_grad():
+    #     model_output = model(**encoded_input)
+    #
+    # # Perform pooling
+    # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    #
+    # # Normalize embeddings
+    # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+    #
+    # st.text("Sentence embeddings:")
+    # st.text(sentence_embeddings)
+    #
+    #
+    #corpus_embeddings = sentence_embeddings
+    # Query sentences
+    def plot_cloud(wordcloud):
+        # Set figure size
+        st.pyplot.figure(figsize=(40, 30))
+        # Display image
+        st.pyplot(wordcloud)
+        # No axis details
+        #st.pyplot.axis("off");
+    userinput = st.text_input('Tell us what are you looking in your hotel?')
+    if not userinput:
+        st.write("Please enter a query to get results")
+    else:
+        query = [str(userinput)]
+        doc = nlp(str(userinput))
+        for ent in doc.ents:
+            if ent.label_ == 'GPE':
+                if ent.text in countries:
+                    st.write(f"Country : {ent.text}")
+                elif ent.text in cities:
+                    st.write("city")
+                    st.write(ent.text)
+                    st.write(f"City : {ent.text}")
+                else:
+                    print(f"Other GPE : {ent.text}")
+        # query_embeddings = embedder.encode(queries,show_progress_bar=True)
+        top_k = min(5, len(corpus))
+        query_embedding = embedder.encode(query, convert_to_tensor=True)
+        # We use cosine-similarity and torch.topk to find the highest 5 scores
+        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
+        top_results = torch.topk(cos_scores, k=top_k)
+        # st.write("\n\n======================\n\n")
+        # st.write("Query:", query)
+        # # doc = nlp(query)
+        sentence_spans = list(doc.sents)
+        ent_html = displacy.render(doc, style="ent", jupyter=False)
+# Display the entity visualization in the browser:
+        st.markdown(ent_html, unsafe_allow_html=True)
+        #displacy.render(doc, jupyter = True, style="ent")
+        st.write("##")
+        st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
+        st.write("\n\n======================\n\n")
+        for score, idx in zip(top_results[0], top_results[1]):
+            row_dict = df.loc[df['all_review']== corpus[idx]]
+            st.subheader(row_dict['hotel_name'].values[0])
+            hotel_subset = df_all.loc[df_all['hotel_name']==row_dict['hotel_name'].values[0]]
+            st.caption("Review Summary:")
+            st.write(row_dict['summary'].values[0])
+            st.caption("Relevancy: {:.4f}".format(score))
+            st.caption("Relevant reviews:")
+            df_sentences_h = hotel_subset.set_index("review_body")
+            df_sentences_h = df_sentences_h["hotel_name"].to_dict()
+            df_sentences_list_h = list(df_sentences_h.keys())
+            df_sentences_list_h = [str(d) for d in tqdm(df_sentences_list_h)]
+            #
+            corpus_h = df_sentences_list_h
+            corpus_embeddings_h = embedder.encode(corpus_h,show_progress_bar=True)
+            cos_scores_h = util.pytorch_cos_sim(query_embedding, corpus_embeddings_h)[0]
+            top_results_h = torch.topk(cos_scores_h, k=top_k)
+            for score, idx in zip(top_results_h[0], top_results_h[1]):
+                st.write(corpus_h[idx])
+            # st.table(hotel_subset.head())
+            # st.write("#")
+            #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
+            # wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
+            # fig, ax = plt.subplots()
+            # plt.imshow(wordcloud, interpolation='bilinear')
+            # plt.axis("off")
+            # plt.show()
+            # st.pyplot(fig)
+            # st.set_option('deprecation.showPyplotGlobalUse', False)
+if __name__ == '__main__':
+    main()
+    # cos_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)[0]
+    # top_results = torch.topk(cos_scores, k=top_k)
+    # st.write("\n\n======================\n\n")
+    # st.write("Query:", query)
+    # st.write("\nTop 5 most similar sentences in corpus using sentence embedding:")
+    #
+    # for score, idx in zip(top_results[0], top_results[1]):
+    #     st.write("(Score: {:.4f})".format(score))
+    #     row_dict = df.loc[df['all_review']== corpus[idx]]
+    #     st.write("paper_id:  " , row_dict['hotel_name'] , "\n")
+    #     #wordcloud = WordCloud(width = 3000, height = 2000, random_state=1, background_color='navy', colormap='rainbow', collocations=False, stopwords = STOPWORDS, mask=mask).generate(corpus[idx])
+    #     wordcloud = WordCloud(collocations=False,stopwords=stopwords,background_color='black',max_words=35).generate(corpus[idx])
+    #     fig, ax = plt.subplots()
+    #     plt.imshow(wordcloud, interpolation='bilinear')
+    #     plt.axis("off")
+    #     plt.show()
+    #     st.pyplot(fig)
+    #     st.set_option('deprecation.showPyplotGlobalUse', False)
+# embedder = SentenceTransformer('all-MiniLM-L6-v2')
+#
+# corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)
+# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity

app.yaml ADDED Viewed

	@@ -0,0 +1,3 @@

+service: default
+runtime: custom
+env: flex

basic.py ADDED Viewed

	@@ -0,0 +1,166 @@

+#!/usr/bin/env python
+from datetime import datetime
+from time import time
+from lxml import html,etree
+from reviews_final import scrape, write_in_csv
+import pandas as pd
+import requests,re
+import os,sys
+import unicodecsv as csv
+import argparse
+import numpy as np
+import json
+def clean(text):
+    if text:
+        # Removing \n \r and \t
+        return ' '.join(''.join(text).split()).strip()
+    return None
+def parse(locality,checkin_date,checkout_date,sort):
+    checkIn = checkin_date.strftime("%Y/%m/%d")
+    checkOut = checkout_date.strftime("%Y/%m/%d")
+    print ("Scraper Inititated for Locality:%s"%locality)
+    header = {
+                            'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.81 Safari/537.36'
+            }
+    # TA rendering the autocomplete list using this API
+    print ("Finding search result page URL")
+    geo_url = 'https://www.tripadvisor.com/TypeAheadJson?action=API&startTime='+str(int(time()))+'&uiOrigin=GEOSCOPE&source=GEOSCOPE&interleaved=true&types=geo,theme_park&neighborhood_geos=true&link_type=hotel&details=true&max=12&injectNeighborhoods=true&query='+locality
+    api_response  = requests.get(geo_url,headers=header, timeout=120).json()
+    #getting the TA url for th equery from the autocomplete response
+    url_from_autocomplete = "http://www.tripadvisor.com"+api_response['results'][0]['url']
+    print ('URL found %s'%url_from_autocomplete)
+    geo = api_response['results'][0]['value']
+    #Formating date for writing to file
+    a=url_from_autocomplete
+    b=a.split("-")
+    s="-"
+    c=s.join([b[0],b[1],"oa30",b[2],b[3]])
+    d=s.join([b[0],b[1],"oa60",b[2],b[3]])
+    e=s.join([b[0],b[1],"oa90",b[2],b[3]])
+    f=s.join([b[0],b[1],"oa120",b[2],b[3]])
+    urllist = [a,c,d,e,f]
+    date = checkin_date.strftime("%Y_%m_%d")+"_"+checkout_date.strftime("%Y_%m_%d")
+    #form data to get the hotels list from TA for the selected date
+    form_data = {'changeSet': 'TRAVEL_INFO',
+            'showSnippets': 'false',
+            'staydates':date,
+            'uguests': '2',
+            'sortOrder':sort
+    }
+    json_arr = []
+    for url_from_autocomplete in urllist:
+        print(url_from_autocomplete)
+        headers = {
+                                'Accept': 'text/javascript, text/html, application/xml, text/xml, */*',
+                                'Accept-Encoding': 'gzip,deflate',
+                                'Accept-Language': 'en-US,en;q=0.5',
+                                'Cache-Control': 'no-cache',
+                                'Connection': 'keep-alive',
+                                'Content-Type': 'application/x-www-form-urlencoded; charset=utf-8',
+                                'Host': 'www.tripadvisor.com',
+                                'Pragma': 'no-cache',
+                                'Referer': url_from_autocomplete,
+                                'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:28.0) Gecko/20100101 Firefox/28.0',
+                                'X-Requested-With': 'XMLHttpRequest'
+                            }
+        cookies=  {"SetCurrency":"USD"}
+        print ("Downloading search results page")
+        page_response  = requests.post(url = url_from_autocomplete,data=form_data,headers = headers, cookies = cookies, verify=False)
+        print ("Parsing results ")
+        parser = html.fromstring(page_response.text)
+        hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[contains(@class,"listing collapsed")]')
+        hotel_data = []
+        if not hotel_lists:
+            hotel_lists = parser.xpath('//div[contains(@class,"listItem")]//div[@class="listing "]')
+        for hotel in hotel_lists:
+            XPATH_HOTEL_LINK = './/a[contains(@class,"property_title")]/@href'
+            XPATH_REVIEWS  = './/a[@class="review_count"]//text()'
+            XPATH_RANK = './/div[@class="popindex"]//text()'
+            XPATH_RATING = './/span[contains(@class,"ui_bubble_rating bubble_45")]/@alt' #update this code to get rating
+            XPATH_RATING_2 = './/a[contains(@class,"ui_bubble_rating bubble_45")]/@alt' #update this code to get rating
+            XPATH_HOTEL_NAME = './/a[contains(@class,"property_title")]//text()'
+            XPATH_HOTEL_FEATURES = './/div[contains(@casls,"common_hotel_icons_list")]//li//text()'
+            XPATH_HOTEL_PRICE = './/div[contains(@data-sizegroup,"mini-meta-price")]/text()'
+            XPATH_VIEW_DEALS = './/div[contains(@data-ajax-preserve,"viewDeals")]//text()'
+            XPATH_BOOKING_PROVIDER = './/div[contains(@data-sizegroup,"mini-meta-provider")]//text()'  #<span class="dekGp Ci _R S4 H3 MD">#74 of 319 hotels in Lisbon</span><span class="dekGp Ci _R S4 H3 MD">#6 of 319 hotels in Lisbon</span>
+            XPATH_RATING_ORDER = './/span[contains(@class,"dekGp Ci _R S4 H3 MD")]//text()'
+            XPATH_OFFICIAL_DESCRIPTION = '//div[contains(text(),"Description")]/following-sibling::div//span[contains(@class,"introText")]/text()'
+            raw_booking_provider = hotel.xpath(XPATH_BOOKING_PROVIDER)
+            raw_no_of_deals =  hotel.xpath(XPATH_VIEW_DEALS)
+            raw_hotel_link = hotel.xpath(XPATH_HOTEL_LINK)
+            raw_no_of_reviews = hotel.xpath(XPATH_REVIEWS)
+            raw_rank = hotel.xpath(XPATH_RANK)
+            raw_rating = hotel.xpath(XPATH_RATING_2)
+            raw_hotel_name = hotel.xpath(XPATH_HOTEL_NAME)
+            raw_hotel_features = hotel.xpath(XPATH_HOTEL_FEATURES)
+            raw_hotel_price_per_night  = hotel.xpath(XPATH_HOTEL_PRICE)
+            raw_rank_order = hotel.xpath(XPATH_RATING_ORDER)
+            raw_official_description = parser.xpath(XPATH_OFFICIAL_DESCRIPTION)
+            url = 'http://www.tripadvisor.com'+raw_hotel_link[0] if raw_hotel_link else  None
+            reviews = ''.join(raw_no_of_reviews).replace("reviews","").replace(",","") if raw_no_of_reviews else 0
+            rank = ''.join(raw_rank) if raw_rank else None
+            rating = ''.join(raw_rating).replace('of 5 bubbles','').strip() if raw_rating else None
+            name = ''.join(raw_hotel_name).strip() if raw_hotel_name else None
+            hotel_features = ','.join(raw_hotel_features)
+            #price_per_night = ''.join(raw_hotel_price_per_night).encode('utf-8').replace('\n','') if raw_hotel_price_per_night else None
+            price_per_night = ''.join(raw_hotel_price_per_night).replace('\n','') if raw_hotel_price_per_night else None
+            rank_order = ''.join(raw_rank_order) if raw_rank_order else None
+            no_of_deals = re.findall("all\s+?(\d+)\s+?",''.join(raw_no_of_deals))
+            booking_provider = ''.join(raw_booking_provider).strip() if raw_booking_provider else None
+            official_description = clean(raw_official_description)
+            if no_of_deals:
+                no_of_deals = no_of_deals[0]
+            else:
+                no_of_deals = 0
+            data = {
+                        'hotel_name':name,
+                        'url':url,
+                        'locality':locality,
+                        'reviews':reviews,
+                        'rank':rank,
+                        'tripadvisor_rating':rating,
+                        'checkOut':checkOut,
+                        'checkIn':checkIn,
+                        'hotel_features':hotel_features,
+                        'price_per_night':price_per_night,
+                        'no_of_deals':no_of_deals,
+                        'booking_provider':booking_provider,
+                        'raw_rank': rank_order,
+                        'desc':official_description
+            }
+            if data:
+                print("Writing scraped data")
+                json_arr.append(data)
+                with open('data_file.json', 'w') as outfile:
+                    json.dump(json_arr, outfile)
+    #         hotel_data.append(data)
+    #         all_hotel.append(data)
+    # #Referrer is necessary to get the correct response from TA if not provided they will redirect to home page
+    # my_df=pd.DataFrame(all_hotel)
+    # print(my_df['hotel_name'])
+    return urllist

combined_paris.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

corpus_embeddings_bi_encoder.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:1854af45783940daefdea27ee8e42f026faefdc4ff4a41067c6ee4ca6eb74ade
+size 64918

corpus_embeddings_bi_encoder.pickle 2 ADDED Viewed

Binary file (64.9 kB). View file

df_combined.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

df_combined_paris.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

embeddings.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:3759225896afa4282dee721d96d1d1a8085cde7ccffe29e975568a5499a36548
+size 64640

embeddings_h_r.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:76ae4840488129cd4c6917830018219292cca514e62c69ea9e507b185d219aa7
+size 4391552

embeddings_review.npy ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:96cee6d14a75d19eccbe9decb501dd3c5de6c1fe401d3803a82611f075a8a6a8
+size 144512

en_core_web_sm-3.2.0-py3-none-any.whl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5e02939fb7fbae6dbcc9c5a1355f5e4e02939b649a1f0846ee844ac1d479bbeb
+size 13900196

paris-newer.py ADDED Viewed

	@@ -0,0 +1,295 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+@author: Hamza Farooq
+"""
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import os
+nlp = spacy.load("en_core_web_sm")
+from sentence_transformers import SentenceTransformer, CrossEncoder, util
+import datetime
+from spacy import displacy
+import streamlit as st
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from matplotlib import pyplot as plt
+import nltk
+from rank_bm25 import BM25Okapi
+from sklearn.feature_extraction import _stop_words
+import string
+from tqdm.autonotebook import tqdm
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+import pickle
+from sentence_transformers import SentenceTransformer, util
+import torch
+# import utils as utl
+import time
+import torch
+import transformers
+from transformers import BartTokenizer, BartForConditionalGeneration
+from string import punctuation
+# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+from sentence_transformers import SentenceTransformer, util
+import torch
+def main():
+    # Settings
+    st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈"   )
+    from string import punctuation
+    punctuation=punctuation+ '\n'
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    import numpy as np
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    import scipy.spatial
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    #import os
+    @st.cache(allow_output_mutation=True)
+    def load_model():
+        return SentenceTransformer('all-MiniLM-L6-v2'),SentenceTransformer('multi-qa-MiniLM-L6-cos-v1'),CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')
+    embedder,bi_encoder,cross_encoder = load_model()
+    #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
+    st.title("Parisian Hotel Finder")
+    with st.expander("ℹ️ - About this app", expanded=True):
+        st.write(
+            """
+    -   This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
+    -   It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) 🤗.
+    	    """
+        )
+    punctuation=punctuation+ '\n'
+    #import os
+    # embedder = SentenceTransformer('all-MiniLM-L6-v2')
+    def lower_case(input_str):
+        input_str = input_str.lower()
+        return input_str
+    df_all = pd.read_csv('paris_clean_newer.csv')
+    df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).text.apply(''.join).reset_index(name='all_review')
+    df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')
+    df_combined_paris_summary = df_combined_paris_summary[['Hotel','summary']]
+    import re
+    # df_combined = pd.read_csv('df_combined.csv')
+    df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
+    df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
+    df_basic = df_all[['Hotel','description','price_per_night']].drop_duplicates()
+    df_basic = df_basic.merge(df_combined_paris_summary,how='left')
+    df_combined_e = df_combined.merge(df_basic)
+    df_combined_e['all_review'] =df_combined_e['description']+ df_combined_e['all_review'] + df_combined_e['price_per_night']
+    df = df_combined_e.copy()
+    df_sentences = df_combined_e.set_index("all_review")
+    df_sentences = df_sentences["Hotel"].to_dict()
+    df_sentences_list = list(df_sentences.keys())
+    import pandas as pd
+    from tqdm import tqdm
+    from sentence_transformers import SentenceTransformer, util
+    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
+    #
+    corpus = df_sentences_list
+    # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
+    corpus_embeddings = np.load('embeddings.npy')
+    bi_encoder.max_seq_length = 512     #Truncate long passages to 256 tokens
+    top_k = 32                          #Number of passages we want to retrieve with the bi-encoder
+    #The bi-encoder will retrieve 100 documents. We use a cross-encoder, to re-rank the results list to improve the quality
+    # corpus_embeddings_h = np.load('embeddings_h_r.npy')
+    with open('corpus_embeddings_bi_encoder.pickle', 'rb') as pkl:
+        doc_embedding = pickle.load(pkl)
+    with open('tokenized_corpus.pickle', 'rb') as pkl:
+        tokenized_corpus = pickle.load(pkl)
+    bm25 = BM25Okapi(tokenized_corpus)
+    passages = corpus
+# We lower case our text and remove stop-words from indexing
+    def bm25_tokenizer(text):
+        tokenized_doc = []
+        for token in text.lower().split():
+            token = token.strip(string.punctuation)
+            if len(token) > 0 and token not in _stop_words.ENGLISH_STOP_WORDS:
+                tokenized_doc.append(token)
+        return tokenized_doc
+    def search(query):
+        # q = [str(userinput)]
+        doc = nlp(str(userinput))
+        ent_html = displacy.render(doc, style="ent", jupyter=False)
+# Display the entity visualization in the browser:
+        st.markdown(ent_html, unsafe_allow_html=True)
+        ##### BM25 search (lexical search) #####
+        bm25_scores = bm25.get_scores(bm25_tokenizer(query))
+        top_n = np.argpartition(bm25_scores, -5)[-5:]
+        bm25_hits = [{'corpus_id': idx, 'score': bm25_scores[idx]} for idx in top_n]
+        bm25_hits = sorted(bm25_hits, key=lambda x: x['score'], reverse=True)
+        bm25list = {}
+        st.title("Top-5 lexical search (BM25) hits")
+        for hit in bm25_hits[0:5]:
+            row_dict = df.loc[df['all_review']== corpus[hit['corpus_id']]]
+            st.subheader(row_dict['Hotel'].values[0])
+            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
+            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
+            st.write('Description:')
+            st.expander(de.description.values[0],expanded=False)
+            # try:
+            #     st.write('Summary')
+            #     st.expander(de.summary.values[0],expanded=False)
+            # except:
+            #     None
+            # doc = corpus[hit['corpus_id']]
+            # kp.get_key_phrases(doc)
+            bm25list[row_dict['Hotel'].values[0]] = de.description.values[0][0:200]
+        #### Sematic Search #####
+        # Encode the query using the bi-encoder and find potentially relevant passages
+        question_embedding = bi_encoder.encode(query, convert_to_tensor=True)
+    #     question_embedding = question_embedding.cuda()
+        hits = util.semantic_search(question_embedding, corpus_embeddings, top_k=top_k)
+        hits = hits[0]  # Get the hits for the first query
+        ##### Re-Ranking #####
+        # Now, score all retrieved passages with the cross_encoder
+        cross_inp = [[query, passages[hit['corpus_id']]] for hit in hits]
+        cross_scores = cross_encoder.predict(cross_inp)
+        # Sort results by the cross-encoder scores
+        for idx in range(len(cross_scores)):
+            hits[idx]['cross-score'] = cross_scores[idx]
+        # Output of top-5 hits from bi-encoder
+        st.write("\n-------------------------\n")
+        st.title("Top-5 Bi-Encoder Retrieval hits")
+        hits = sorted(hits, key=lambda x: x['score'], reverse=True)
+        for hit in hits[0:5]:
+    #         st.write("\t{:.3f}\t{}".format(hit['score'], passages[hit['corpus_id']].replace("\n", " ")))
+            row_dict = df.loc[df['all_review']== corpus[hit['corpus_id']]]
+            st.subheader(row_dict['Hotel'].values[0])
+            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
+            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
+            st.write('Description:')
+            st.expander(de.description.values[0])
+            # try:
+            #     st.write('Summary')
+            #     st.expander(de.summary.values[0],expanded=False)
+            # except:
+            #     None
+        # Output of top-5 hits from re-ranker
+        st.write("\n-------------------------\n")
+        st.title("Top-5 Cross-Encoder Re-ranker hits")
+        hits = sorted(hits, key=lambda x: x['cross-score'], reverse=True)
+        for hit in hits[0:5]:
+    #         st.write("\t{:.3f}\t{}".format(hit['cross-score'], passages[hit['corpus_id']].replace("\n", " ")))
+            row_dict = df.loc[df['all_review']== corpus[hit['corpus_id']]]
+            st.subheader(row_dict['Hotel'].values[0])
+            de = df_basic.loc[df_basic.Hotel == row_dict['Hotel'].values[0]]
+            st.write(f'\tPrice Per night: {de.price_per_night.values[0]}')
+            st.write('Description:')
+            st.expander(de.description.values[0])
+            # try:
+            #     st.write('Summary')
+            #     st.expander(de.summary.values[0],expanded=False)
+            # except:
+            #     None
+    sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
+    userinput = st.text_input('Tell us what are you looking in your hotel?','e.g. Hotel near Eiffel Tower with big rooms',autocomplete="on")
+    da = st.date_input(
+        "Date Check-in",
+        datetime.date(2022, 10, 5))
+    dst = st.date_input(
+        "Date Check-out",
+        datetime.date(2022, 10, 8))
+    if not userinput or userinput == sampletext:
+        st.write("Please enter a query to get results")
+    else:
+        query = [str(userinput)]
+        doc = nlp(str(userinput))
+        search(str(userinput))
+        # We use cosine-similarity and torch.topk to find the highest 5 scores
+if __name__ == '__main__':
+    main()

paris.py ADDED Viewed

	@@ -0,0 +1,298 @@

+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+@author: Hamza Farooq
+"""
+import spacy
+from spacy.lang.en.stop_words import STOP_WORDS
+from string import punctuation
+from collections import Counter
+from heapq import nlargest
+import os
+nlp = spacy.load("en_core_web_sm")
+from spacy import displacy
+import streamlit as st
+import matplotlib.pyplot as plt
+from wordcloud import WordCloud
+from matplotlib import pyplot as plt
+import nltk
+# import utils as utl
+import time
+import torch
+import transformers
+from transformers import BartTokenizer, BartForConditionalGeneration
+from string import punctuation
+# tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
+import numpy as np
+import pandas as pd
+from sentence_transformers import SentenceTransformer
+import scipy.spatial
+import pickle as pkl
+from sentence_transformers import SentenceTransformer, util
+import torch
+def main():
+    # Settings
+    st.set_page_config(layout="wide", page_title='Paris Hotel Finder', page_icon="🎈"   )
+    from string import punctuation
+    punctuation=punctuation+ '\n'
+    # def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):
+    #
+    #   text = text.replace('\n','')
+    #   text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)
+    #   summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))
+    #   summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)
+    #   return summary_txt
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    import numpy as np
+    import pandas as pd
+    from sentence_transformers import SentenceTransformer
+    import scipy.spatial
+    import pickle as pkl
+    from sentence_transformers import SentenceTransformer, util
+    import torch
+    #import os
+    @st.cache(allow_output_mutation=True)
+    def load_model():
+        return SentenceTransformer('all-MiniLM-L6-v2')
+    embedder = load_model()
+    # embedder = SentenceTransformer('all-MiniLM-L6-v2')
+    # gc = geonamescache.GeonamesCache()
+    #
+    # # gets nested dictionary for countries
+    # countries = gc.get_countries()
+    #
+    # # gets nested dictionary for cities
+    # cities = gc.get_cities()
+    # def gen_dict_extract(var, key):
+    #     if isinstance(var, dict):
+    #         for k, v in var.items():
+    #             if k == key:
+    #                 yield v
+    #             if isinstance(v, (dict, list)):
+    #                 yield from gen_dict_extract(v, key)
+    #     elif isinstance(var, list):
+    #         for d in var:
+    #             yield from gen_dict_extract(d, key)
+    #
+    # cities = [*gen_dict_extract(cities, 'name')]
+    # countries = [*gen_dict_extract(countries, 'name')]
+    #
+    # cities.append('New York')
+    # mask = np.array(Image.open('upvote.png'))
+    #original_title = '<p style="font-family:IBM Mono; color:Blue; font-size: 20px;">Original image</p>'
+    st.title("Parisian Hotel Finder")
+    with st.expander("ℹ️ - About this app", expanded=True):
+        st.write(
+            """
+    -   This app allows you to search for hotels based on what you're looking for, rather than just cities - it helps with reducing time to go through exhaustive reviews for each hotel!
+    -   It uses an innovative semantic search approach that leverages multiple NLP embeddings and relies on [Transformers] (https://huggingface.co/transformers/) 🤗.
+    	    """
+        )
+    punctuation=punctuation+ '\n'
+    #import os
+    # embedder = SentenceTransformer('all-MiniLM-L6-v2')
+    df_all = pd.read_csv('combined_paris.csv')
+    df_all = df_all[['Hotel','review']]
+    df_all = df_all.drop_duplicates()
+    df_all = df_all.reset_index(drop=True)
+    summary_hotel = pd.read_csv('df_combined_paris.csv')
+    #
+    # df['hotel_name'].drop_duplicates()
+    df_combined = df_all.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')
+    import re
+    # df_combined = pd.read_csv('df_combined.csv')
+    df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))
+    def lower_case(input_str):
+        input_str = input_str.lower()
+        return input_str
+    df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))
+    df = df_combined
+    df_sentences = df_combined.set_index("all_review")
+    df_sentences = df_sentences["Hotel"].to_dict()
+    df_sentences_list = list(df_sentences.keys())
+    import pandas as pd
+    from tqdm import tqdm
+    from sentence_transformers import SentenceTransformer, util
+    df_sentences_list = [str(d) for d in tqdm(df_sentences_list)]
+    #
+    corpus = df_sentences_list
+    # corpus_embeddings = embedder.encode(corpus,show_progress_bar=True)
+    corpus_embeddings = np.load('embeddings_review.npy')
+    corpus_embeddings_h = np.load('embeddings_h_r.npy')
+    #
+    # model = SentenceTransformer('all-MiniLM-L6-v2')
+    # paraphrases = util.paraphrase_mining(model, corpus)
+    #queries = ['Hotel close to Central Park',
+    #           'Hotel with breakfast'
+    #           ]
+    # from transformers import AutoTokenizer, AutoModel
+    # import torch
+    # import torch.nn.functional as F
+    #
+    # #Mean Pooling - Take attention mask into account for correct averaging
+    # def mean_pooling(model_output, attention_mask):
+    #     token_embeddings = model_output[0] #First element of model_output contains all token embeddings
+    #     input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
+    #     return torch.sum(token_embeddings * input_mask_expanded, 1) / torch.clamp(input_mask_expanded.sum(1), min=1e-9)
+    #
+    #
+    # # Sentences we want sentence embeddings for
+    # sentences = corpus
+    #
+    # # Load model from HuggingFace Hub
+    # tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    # model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L12-v1')
+    #
+    # # Tokenize sentences
+    # encoded_input = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')
+    #
+    # # Compute token embeddings
+    # with torch.no_grad():
+    #     model_output = model(**encoded_input)
+    #
+    # # Perform pooling
+    # sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])
+    #
+    # # Normalize embeddings
+    # sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)
+    #
+    # st.text("Sentence embeddings:")
+    # st.text(sentence_embeddings)
+    #
+    #
+    #corpus_embeddings = sentence_embeddings
+    # Query sentences
+    def plot_cloud(wordcloud):
+        # Set figure size
+        st.pyplot.figure(figsize=(20, 10))
+        # Display image
+        st.pyplot(wordcloud)
+        # No axis details
+        #st.pyplot.axis("off");
+    sampletext = 'e.g. Hotel near Eiffel Tower with big rooms'
+    userinput = st.text_input('Tell us what are you looking in your hotel?','e.g. Hotel near Eiffel Tower with big rooms',autocomplete="on")
+    if not userinput or userinput == sampletext:
+        st.write("Please enter a query to get results")
+    else:
+        query = [str(userinput)]
+        doc = nlp(str(userinput))
+        # for ent in doc.ents:
+        #     if ent.label_ == 'GPE':
+        #         if ent.text in countries:
+        #             st.write(f"Country : {ent.text}")
+        #         elif ent.text in cities:
+        #             st.write("city")
+        #             st.write(ent.text)
+        #             st.write(f"City : {ent.text}")
+        #         else:
+        #             print(f"Other GPE : {ent.text}")
+        # query_embeddings = embedder.encode(queries,show_progress_bar=True)
+        top_k = min(5, len(corpus))
+        query_embedding = embedder.encode(query, convert_to_tensor=True)
+        # We use cosine-similarity and torch.topk to find the highest 5 scores
+        cos_scores = util.pytorch_cos_sim(query_embedding, corpus_embeddings)[0]
+        top_results = torch.topk(cos_scores, k=top_k)
+        # st.write("\n\n======================\n\n")
+        # st.write("Query:", query)
+        # # doc = nlp(query)
+        sentence_spans = list(doc.sents)
+        ent_html = displacy.render(doc, style="ent", jupyter=False)
+# Display the entity visualization in the browser:
+        st.markdown(ent_html, unsafe_allow_html=True)
+        #displacy.render(doc, jupyter = True, style="ent")
+        st.write("##")
+        st.subheader("\n\n\n\n\n\nTop 5 most relevant hotels:\n\n\n\n\n\n\n")
+        st.write("\n\n======================\n\n")
+        for score, idx in zip(top_results[0], top_results[1]):
+            row_dict = df.loc[df['all_review']== corpus[idx]]
+            st.subheader(row_dict['Hotel'].values[0])
+            hotel_subset = df_all.loc[df_all['Hotel']==row_dict['Hotel'].values[0]]
+            hotel_sub = summary_hotel.loc[summary_hotel['Hotel']==row_dict['Hotel'].values[0]]
+            st.caption("Review Summary:")
+            st.write(hotel_sub['summary'].values[0])
+            st.caption("Relevancy: {:.4f}".format(score))
+            st.caption("Relevant reviews:")
+            df_sentences_h = hotel_subset.set_index("review")
+            df_sentences_h = df_sentences_h["Hotel"].to_dict()
+            df_sentences_list_h = list(df_sentences_h.keys())
+            df_sentences_list_h = [str(d) for d in tqdm(df_sentences_list_h)]
+            #
+            corpus_h = df_sentences_list_h
+            # corpus_embeddings_h = embedder.encode(corpus_h,show_progress_bar=True)
+            sublist = [element for i, element in enumerate(corpus_embeddings_h) if i in (df_all[df_all['Hotel'] == row_dict['Hotel'].values[0]].index.values)]
+            cos_scores_h = util.pytorch_cos_sim(query_embedding, sublist)[0]
+            top_results_h = torch.topk(cos_scores_h, k=top_k)
+            for score, idx in zip(top_results_h[0], top_results_h[1]):
+                st.write(corpus_h[idx])
+if __name__ == '__main__':
+    main()

paris_clean_newer.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

query_generator.ipynb ADDED Viewed

The diff for this file is too large to render. See raw diff

requirements.txt ADDED Viewed

	@@ -0,0 +1,14 @@

+pandas
+streamlit==1.1.0
+regex==2021.8.3
+# "sklearn" is a deprecated placeholder name on PyPI; the real distribution is scikit-learn.
+scikit-learn
+sentence_transformers
+scipy
+tqdm
+gensim
+plotly
+wordcloud
+matplotlib
+spacy
+https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.2.0/en_core_web_sm-3.2.0-py3-none-any.whl
+rank-bm25

summary.ipynb ADDED Viewed

	@@ -0,0 +1,654 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import torch\n",
+    "import transformers\n",
+    "from transformers import BartTokenizer, BartForConditionalGeneration\n",
+    "tr = BartTokenizer.from_pretrained('facebook/bart-large-cnn')\n",
+    "mdl = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')\n",
+    "torch_device = 'cpu'\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):\n",
+    "\n",
+    "      text = text.replace('\\n','')\n",
+    "      text_input_ids = tr.batch_encode_plus([text], return_tensors='pt', max_length=1024)['input_ids'].to(torch_device)\n",
+    "      summary_ids = mdl.generate(text_input_ids, num_beams=int(num_beams), length_penalty=float(length_penalty), max_length=int(max_length), min_length=int(min_length), no_repeat_ngram_size=int(no_repeat_ngram_size))\n",
+    "      summary_txt = tr.decode(summary_ids.squeeze(), skip_special_tokens=True)\n",
+    "      return summary_txt"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import scipy.spatial\n",
+    "import pickle as pkl\n",
+    "from sentence_transformers import SentenceTransformer, util\n",
+    "import torch\n",
+    "#import os\n",
+    "\n",
+    "\n",
+    "df = pd.read_csv('combined_paris.csv')\n",
+    "\n",
+    "\n",
+    "df_combined = df.sort_values(['Hotel']).groupby('Hotel', sort=False).review.apply(''.join).reset_index(name='all_review')\n",
+    "\n",
+    "import re\n",
+    "\n",
+    "df_combined['all_review'] = df_combined['all_review'].apply(lambda x: re.sub('[^a-zA-z0-9\\s]','',x))\n",
+    "def lower_case(input_str):\n",
+    "        input_str = input_str.lower()\n",
+    "        return input_str"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_combined['all_review']= df_combined['all_review'].apply(lambda x: lower_case(x))\n",
+    "\n",
+    "df = df_combined\n",
+    "\n",
+    "df_sentences = df_combined.set_index(\"all_review\")\n",
+    "\n",
+    "df_sentences = df_sentences[\"Hotel\"].to_dict()\n",
+    "df_sentences_list = list(df_sentences.keys())\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>0</th>\n",
+       "      <th>1</th>\n",
+       "      <th>2</th>\n",
+       "      <th>3</th>\n",
+       "      <th>4</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>Hotel</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>Cler Hotel</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>all_review</th>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "      <td>room was very clean  transportation is very ne...</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                                            0  \\\n",
+       "Hotel                             25hours Hotel Terminus Nord   \n",
+       "all_review  weve spent lots of time in paris and this was ...   \n",
+       "\n",
+       "                                                            1  \\\n",
+       "Hotel                                    Acacias Etoile Hotel   \n",
+       "all_review  the hotel is great for value the breakfast sel...   \n",
+       "\n",
+       "                                                            2  \\\n",
+       "Hotel                                         COQ Hotel Paris   \n",
+       "all_review  stayed for a short city break  the hotel is a ...   \n",
+       "\n",
+       "                                                            3  \\\n",
+       "Hotel                 Campanile Paris 14 - Maine Montparnasse   \n",
+       "all_review  room was very clean  transportation is very ne...   \n",
+       "\n",
+       "                                                            4  \n",
+       "Hotel                                              Cler Hotel  \n",
+       "all_review  we had the best stay at cler hotel  the locati...  "
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined.head().T"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n"
+     ]
+    }
+   ],
+   "source": [
+    "long_summary = []\n",
+    "\n",
+    "for i in range(len(df_combined)):\n",
+    "    t = bart_summarize(df_combined['all_review'][i])\n",
+    "    long_summary.append(t)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_combined['summary'] = long_summary"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_combined.to_csv('df_combined_paris.csv',index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel</th>\n",
+       "      <th>all_review</th>\n",
+       "      <th>summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>we were blown away by this excellent hotel we ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>The hotel is great for value the breakfast sel...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>room was very clean  transportation is very ne...</td>\n",
+       "      <td>hotel turned out to be perfect for our short ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Cler Hotel</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                     Hotel  \\\n",
+       "0              25hours Hotel Terminus Nord   \n",
+       "1                     Acacias Etoile Hotel   \n",
+       "2                          COQ Hotel Paris   \n",
+       "3  Campanile Paris 14 - Maine Montparnasse   \n",
+       "4                               Cler Hotel   \n",
+       "\n",
+       "                                          all_review  \\\n",
+       "0  weve spent lots of time in paris and this was ...   \n",
+       "1  the hotel is great for value the breakfast sel...   \n",
+       "2  stayed for a short city break  the hotel is a ...   \n",
+       "3  room was very clean  transportation is very ne...   \n",
+       "4  we had the best stay at cler hotel  the locati...   \n",
+       "\n",
+       "                                             summary  \n",
+       "0  we were blown away by this excellent hotel we ...  \n",
+       "1  The hotel is great for value the breakfast sel...  \n",
+       "2  stayed for a short city break  the hotel is a ...  \n",
+       "3   hotel turned out to be perfect for our short ...  \n",
+       "4  we had the best stay at cler hotel  the locati...  "
+      ]
+     },
+     "execution_count": 10,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dockerfile                            df_combined.csv\n",
+      "Hotel New York Combined.csv           en_core_web_sm-3.2.0-py3-none-any.whl\n",
+      "README.md                             query_generator.ipynb\n",
+      "Untitled.ipynb                        requirements.txt\n",
+      "app.py                                summary.ipynb\n",
+      "app.yaml\n"
+     ]
+    }
+   ],
+   "source": [
+    "!ls"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/Users/aimzlicious/miniforge3/envs/tf_m1/lib/python3.8/site-packages/huggingface_hub/snapshot_download.py:6: FutureWarning: snapshot_download.py has been made private and will no longer be available from version 0.11. Please use `from huggingface_hub import snapshot_download` to import the only public function in this module. Other members of the file may be changed without a deprecation notice.\n",
+      "  warnings.warn(\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "import scipy.spatial\n",
+    "import pickle as pkl\n",
+    "from sentence_transformers import SentenceTransformer, util\n",
+    "import torch\n",
+    "df_combined_paris_summary = pd.read_csv('df_combined_paris.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel</th>\n",
+       "      <th>all_review</th>\n",
+       "      <th>summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>we were blown away by this excellent hotel we ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>The hotel is great for value the breakfast sel...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>room was very clean  transportation is very ne...</td>\n",
+       "      <td>hotel turned out to be perfect for our short ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Cler Hotel</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                     Hotel  \\\n",
+       "0              25hours Hotel Terminus Nord   \n",
+       "1                     Acacias Etoile Hotel   \n",
+       "2                          COQ Hotel Paris   \n",
+       "3  Campanile Paris 14 - Maine Montparnasse   \n",
+       "4                               Cler Hotel   \n",
+       "\n",
+       "                                          all_review  \\\n",
+       "0  weve spent lots of time in paris and this was ...   \n",
+       "1  the hotel is great for value the breakfast sel...   \n",
+       "2  stayed for a short city break  the hotel is a ...   \n",
+       "3  room was very clean  transportation is very ne...   \n",
+       "4  we had the best stay at cler hotel  the locati...   \n",
+       "\n",
+       "                                             summary  \n",
+       "0  we were blown away by this excellent hotel we ...  \n",
+       "1  The hotel is great for value the breakfast sel...  \n",
+       "2  stayed for a short city break  the hotel is a ...  \n",
+       "3   hotel turned out to be perfect for our short ...  \n",
+       "4  we had the best stay at cler hotel  the locati...  "
+      ]
+     },
+     "execution_count": 4,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined_paris.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_paris = pd.read_csv('paris_clean_newer.csv')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "hotel=pd.DataFrame(df_paris['Hotel'].drop_duplicates())"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>Hotel</th>\n",
+       "      <th>all_review</th>\n",
+       "      <th>summary</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>25hours Hotel Terminus Nord</td>\n",
+       "      <td>weve spent lots of time in paris and this was ...</td>\n",
+       "      <td>we were blown away by this excellent hotel we ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>Acacias Etoile Hotel</td>\n",
+       "      <td>the hotel is great for value the breakfast sel...</td>\n",
+       "      <td>The hotel is great for value the breakfast sel...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>COQ Hotel Paris</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "      <td>stayed for a short city break  the hotel is a ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>Campanile Paris 14 - Maine Montparnasse</td>\n",
+       "      <td>room was very clean  transportation is very ne...</td>\n",
+       "      <td>hotel turned out to be perfect for our short ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>Cler Hotel</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "      <td>we had the best stay at cler hotel  the locati...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>...</th>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "      <td>...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>89</th>\n",
+       "      <td>Sofitel Paris Le Faubourg</td>\n",
+       "      <td>4 years ago i was the last time at sofitel le ...</td>\n",
+       "      <td>4 years ago i was the last time at sofitel le ...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>90</th>\n",
+       "      <td>St Christopher's Gare du Nord Paris</td>\n",
+       "      <td>when arriving to the area it felt a little dan...</td>\n",
+       "      <td>Barry is the best bartender in paris cheers gr...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>91</th>\n",
+       "      <td>St Christopher's Inn Canal Paris</td>\n",
+       "      <td>ive stayed at st christopher inn canal in pari...</td>\n",
+       "      <td>ive stayed at st christopher inn canal in pari...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>92</th>\n",
+       "      <td>Touring Hotel</td>\n",
+       "      <td>hotel is in a great location  minutes walk fro...</td>\n",
+       "      <td>Hotel is in a great location  minutes walk fro...</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>93</th>\n",
+       "      <td>Warwick Paris</td>\n",
+       "      <td>if i know of anybody heading to paris i will r...</td>\n",
+       "      <td>warwick hotel in paris is a good hotel to stay...</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>94 rows × 3 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                                      Hotel  \\\n",
+       "0               25hours Hotel Terminus Nord   \n",
+       "1                      Acacias Etoile Hotel   \n",
+       "2                           COQ Hotel Paris   \n",
+       "3   Campanile Paris 14 - Maine Montparnasse   \n",
+       "4                                Cler Hotel   \n",
+       "..                                      ...   \n",
+       "89                Sofitel Paris Le Faubourg   \n",
+       "90      St Christopher's Gare du Nord Paris   \n",
+       "91         St Christopher's Inn Canal Paris   \n",
+       "92                            Touring Hotel   \n",
+       "93                            Warwick Paris   \n",
+       "\n",
+       "                                           all_review  \\\n",
+       "0   weve spent lots of time in paris and this was ...   \n",
+       "1   the hotel is great for value the breakfast sel...   \n",
+       "2   stayed for a short city break  the hotel is a ...   \n",
+       "3   room was very clean  transportation is very ne...   \n",
+       "4   we had the best stay at cler hotel  the locati...   \n",
+       "..                                                ...   \n",
+       "89  4 years ago i was the last time at sofitel le ...   \n",
+       "90  when arriving to the area it felt a little dan...   \n",
+       "91  ive stayed at st christopher inn canal in pari...   \n",
+       "92  hotel is in a great location  minutes walk fro...   \n",
+       "93  if i know of anybody heading to paris i will r...   \n",
+       "\n",
+       "                                              summary  \n",
+       "0   we were blown away by this excellent hotel we ...  \n",
+       "1   The hotel is great for value the breakfast sel...  \n",
+       "2   stayed for a short city break  the hotel is a ...  \n",
+       "3    hotel turned out to be perfect for our short ...  \n",
+       "4   we had the best stay at cler hotel  the locati...  \n",
+       "..                                                ...  \n",
+       "89  4 years ago i was the last time at sofitel le ...  \n",
+       "90  Barry is the best bartender in paris cheers gr...  \n",
+       "91  ive stayed at st christopher inn canal in pari...  \n",
+       "92  Hotel is in a great location  minutes walk fro...  \n",
+       "93  warwick hotel in paris is a good hotel to stay...  \n",
+       "\n",
+       "[94 rows x 3 columns]"
+      ]
+     },
+     "execution_count": 11,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df_combined_paris.merge(hotel,how='left')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "interpreter": {
+   "hash": "4bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676"
+  },
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.8.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

tokenized_corpus.pickle ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:e99b20be01f7889248d5b3f667df8947ae6ca676f3a525717305e5124c8b739e
+size 1261235