{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pickle as pkl\n",
    "import re\n",
    "\n",
    "import pandas as pd\n",
    "import scipy.spatial\n",
    "import torch\n",
    "import transformers\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "from transformers import BartForConditionalGeneration, BartTokenizer\n",
    "\n",
    "# Checkpoint shared by the tokenizer and the model.\n",
    "MODEL_NAME = 'facebook/bart-large-cnn'\n",
    "torch_device = 'cpu'\n",
    "\n",
    "tr = BartTokenizer.from_pretrained(MODEL_NAME)\n",
    "# Keep the model on the same device the input ids are sent to.\n",
    "mdl = BartForConditionalGeneration.from_pretrained(MODEL_NAME).to(torch_device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def bart_summarize(text, num_beams=20, length_penalty=2, max_length=2048, min_length=56, no_repeat_ngram_size=2):\n",
    "    \"\"\"Summarize `text` with the notebook-level BART tokenizer (`tr`) and model (`mdl`).\n",
    "\n",
    "    Args:\n",
    "        text: String to summarize; newlines are replaced by spaces.\n",
    "        num_beams: Beam-search width forwarded to `mdl.generate`.\n",
    "        length_penalty: Length penalty forwarded to `mdl.generate`.\n",
    "        max_length: Upper bound on the generated length; capped at the\n",
    "            model's `max_position_embeddings`.\n",
    "        min_length: Lower bound on the generated length.\n",
    "        no_repeat_ngram_size: n-gram size that may not repeat in the output.\n",
    "\n",
    "    Returns:\n",
    "        The decoded summary as a string, with special tokens removed.\n",
    "    \"\"\"\n",
    "    # Use a space, not an empty string, so words on either side of a\n",
    "    # newline are not fused into one token.\n",
    "    text = text.replace('\\n', ' ')\n",
    "    # Inputs longer than 1024 tokens are truncated explicitly; passing\n",
    "    # `max_length` alone only triggers a warning and an implicit default.\n",
    "    encoded = tr([text], return_tensors='pt', max_length=1024, truncation=True)\n",
    "    text_input_ids = encoded['input_ids'].to(torch_device)\n",
    "    # The decoder has no position embeddings beyond this limit, so a larger\n",
    "    # request could only fail; the caller-facing default stays unchanged.\n",
    "    generation_limit = min(int(max_length), mdl.config.max_position_embeddings)\n",
    "    summary_ids = mdl.generate(\n",
    "        text_input_ids,\n",
    "        num_beams=int(num_beams),\n",
    "        length_penalty=float(length_penalty),\n",
    "        max_length=generation_limit,\n",
    "        min_length=int(min_length),\n",
    "        no_repeat_ngram_size=int(no_repeat_ngram_size),\n",
    "    )\n",
    "    # A single text was encoded, so the batch holds exactly one sequence.\n",
    "    return tr.decode(summary_ids[0], skip_special_tokens=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def lower_case(input_str):\n",
    "    \"\"\"Return `input_str` converted to lower case.\"\"\"\n",
    "    return input_str.lower()\n",
    "\n",
    "\n",
    "df = pd.read_csv('combined_paris.csv')\n",
    "\n",
    "# One row per hotel. Reviews are joined with a space so the last word of\n",
    "# one review is not glued to the first word of the next.\n",
    "df_combined = (\n",
    "    df.sort_values(['Hotel'])\n",
    "    .groupby('Hotel', sort=False)\n",
    "    .review.apply(' '.join)\n",
    "    .reset_index(name='all_review')\n",
    ")\n",
    "\n",
    "# Keep only ASCII letters, digits and whitespace. Note `A-Z`, not `A-z`:\n",
    "# the latter would also keep the punctuation that sits between `Z` and `a`.\n",
    "df_combined['all_review'] = df_combined['all_review'].str.replace(r'[^a-zA-Z0-9\\s]', '', regex=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "df_combined['all_review'] = df_combined['all_review'].map(lower_case)\n",
    "\n",
    "# NOTE: from here on `df` refers to the per-hotel frame, not the raw reviews.\n",
    "df = df_combined\n",
    "\n",
    "# Map each hotel's combined review text back to the hotel name.\n",
    "df_sentences = df_combined.set_index('all_review')['Hotel'].to_dict()\n",
    "df_sentences_list = list(df_sentences.keys())\n"
   ]
  },
  { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
01234
Hotel25hours Hotel Terminus NordAcacias Etoile HotelCOQ Hotel ParisCampanile Paris 14 - Maine MontparnasseCler Hotel
all_reviewweve spent lots of time in paris and this was ...the hotel is great for value the breakfast sel...stayed for a short city break the hotel is a ...room was very clean transportation is very ne...we had the best stay at cler hotel the locati...
\n", "
" ], "text/plain": [ " 0 \\\n", "Hotel 25hours Hotel Terminus Nord \n", "all_review weve spent lots of time in paris and this was ... \n", "\n", " 1 \\\n", "Hotel Acacias Etoile Hotel \n", "all_review the hotel is great for value the breakfast sel... \n", "\n", " 2 \\\n", "Hotel COQ Hotel Paris \n", "all_review stayed for a short city break the hotel is a ... \n", "\n", " 3 \\\n", "Hotel Campanile Paris 14 - Maine Montparnasse \n", "all_review room was very clean transportation is very ne... \n", "\n", " 4 \n", "Hotel Cler Hotel \n", "all_review we had the best stay at cler hotel the locati... " ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_combined.head().T" ] },
  { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.\n" ] } ],
   "source": [
    "# Summarize every hotel's combined reviews. Iterating over the column\n",
    "# values (instead of looking each row up by the label `i`) keeps this\n",
    "# correct even if the frame's index is not 0..n-1.\n",
    "long_summary = [bart_summarize(review) for review in df_combined['all_review']]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One summary per row, in the same order as the rows were read above.\n",
    "df_combined['summary'] = long_summary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Persist hotel, combined reviews and summary without the index column.\n",
    "df_combined.to_csv('df_combined_paris.csv', index=False)"
   ]
  },
  { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Hotelall_reviewsummary
025hours Hotel Terminus Nordweve spent lots of time in paris and this was ...we were blown away by this excellent hotel we ...
1Acacias Etoile Hotelthe hotel is great for value the breakfast sel...The hotel is great for value the breakfast sel...
2COQ Hotel Parisstayed for a short city break the hotel is a ...stayed for a short city break the hotel is a ...
3Campanile Paris 14 - Maine Montparnasseroom was very clean transportation is very ne...hotel turned out to be perfect for our short ...
4Cler Hotelwe had the best stay at cler hotel the locati...we had the best stay at cler hotel the locati...
\n", "
" ], "text/plain": [ " Hotel \\\n", "0 25hours Hotel Terminus Nord \n", "1 Acacias Etoile Hotel \n", "2 COQ Hotel Paris \n", "3 Campanile Paris 14 - Maine Montparnasse \n", "4 Cler Hotel \n", "\n", " all_review \\\n", "0 weve spent lots of time in paris and this was ... \n", "1 the hotel is great for value the breakfast sel... \n", "2 stayed for a short city break the hotel is a ... \n", "3 room was very clean transportation is very ne... \n", "4 we had the best stay at cler hotel the locati... \n", "\n", " summary \n", "0 we were blown away by this excellent hotel we ... \n", "1 The hotel is great for value the breakfast sel... \n", "2 stayed for a short city break the hotel is a ... \n", "3 hotel turned out to be perfect for our short ... \n", "4 we had the best stay at cler hotel the locati... " ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "# Preview the per-hotel frame, now including the `summary` column.\n",
    "df_combined.head()"
   ]
  },
  { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/Users/aimzlicious/miniforge3/envs/tf_m1/lib/python3.8/site-packages/huggingface_hub/snapshot_download.py:6: FutureWarning: snapshot_download.py has been made private and will no longer be available from version 0.11. Please use `from huggingface_hub import snapshot_download` to import the only public function in this module. 
Other members of the file may be changed without a deprecation notice.\n", " warnings.warn(\n" ] } ],
   "source": [
    "import pickle as pkl\n",
    "\n",
    "import pandas as pd\n",
    "import scipy.spatial\n",
    "import torch\n",
    "from sentence_transformers import SentenceTransformer, util\n",
    "\n",
    "# The cells below refer to this frame as `df_combined_paris`, so bind that\n",
    "# name here; otherwise they raise NameError on a fresh kernel.\n",
    "df_combined_paris = pd.read_csv('df_combined_paris.csv')\n",
    "# Alias kept for any code that still uses the longer name.\n",
    "df_combined_paris_summary = df_combined_paris"
   ]
  },
  { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Hotelall_reviewsummary
025hours Hotel Terminus Nordweve spent lots of time in paris and this was ...we were blown away by this excellent hotel we ...
1Acacias Etoile Hotelthe hotel is great for value the breakfast sel...The hotel is great for value the breakfast sel...
2COQ Hotel Parisstayed for a short city break the hotel is a ...stayed for a short city break the hotel is a ...
3Campanile Paris 14 - Maine Montparnasseroom was very clean transportation is very ne...hotel turned out to be perfect for our short ...
4Cler Hotelwe had the best stay at cler hotel the locati...we had the best stay at cler hotel the locati...
\n", "
" ], "text/plain": [ " Hotel \\\n", "0 25hours Hotel Terminus Nord \n", "1 Acacias Etoile Hotel \n", "2 COQ Hotel Paris \n", "3 Campanile Paris 14 - Maine Montparnasse \n", "4 Cler Hotel \n", "\n", " all_review \\\n", "0 weve spent lots of time in paris and this was ... \n", "1 the hotel is great for value the breakfast sel... \n", "2 stayed for a short city break the hotel is a ... \n", "3 room was very clean transportation is very ne... \n", "4 we had the best stay at cler hotel the locati... \n", "\n", " summary \n", "0 we were blown away by this excellent hotel we ... \n", "1 The hotel is great for value the breakfast sel... \n", "2 stayed for a short city break the hotel is a ... \n", "3 hotel turned out to be perfect for our short ... \n", "4 we had the best stay at cler hotel the locati... " ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "# Preview the reloaded summaries.\n",
    "df_combined_paris.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Second input file for this section.\n",
    "df_paris = pd.read_csv('paris_clean_newer.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Single-column frame holding each distinct hotel name once\n",
    "# (first occurrence kept, original row labels preserved).\n",
    "hotel = df_paris[['Hotel']].drop_duplicates()"
   ]
  },
  { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Hotelall_reviewsummary
025hours Hotel Terminus Nordweve spent lots of time in paris and this was ...we were blown away by this excellent hotel we ...
1Acacias Etoile Hotelthe hotel is great for value the breakfast sel...The hotel is great for value the breakfast sel...
2COQ Hotel Parisstayed for a short city break the hotel is a ...stayed for a short city break the hotel is a ...
3Campanile Paris 14 - Maine Montparnasseroom was very clean transportation is very ne...hotel turned out to be perfect for our short ...
4Cler Hotelwe had the best stay at cler hotel the locati...we had the best stay at cler hotel the locati...
............
89Sofitel Paris Le Faubourg4 years ago i was the last time at sofitel le ...4 years ago i was the last time at sofitel le ...
90St Christopher's Gare du Nord Pariswhen arriving to the area it felt a little dan...Barry is the best bartender in paris cheers gr...
91St Christopher's Inn Canal Parisive stayed at st christopher inn canal in pari...ive stayed at st christopher inn canal in pari...
92Touring Hotelhotel is in a great location minutes walk fro...Hotel is in a great location minutes walk fro...
93Warwick Parisif i know of anybody heading to paris i will r...warwick hotel in paris is a good hotel to stay...
\n", "

94 rows × 3 columns

\n", "
" ], "text/plain": [ " Hotel \\\n", "0 25hours Hotel Terminus Nord \n", "1 Acacias Etoile Hotel \n", "2 COQ Hotel Paris \n", "3 Campanile Paris 14 - Maine Montparnasse \n", "4 Cler Hotel \n", ".. ... \n", "89 Sofitel Paris Le Faubourg \n", "90 St Christopher's Gare du Nord Paris \n", "91 St Christopher's Inn Canal Paris \n", "92 Touring Hotel \n", "93 Warwick Paris \n", "\n", " all_review \\\n", "0 weve spent lots of time in paris and this was ... \n", "1 the hotel is great for value the breakfast sel... \n", "2 stayed for a short city break the hotel is a ... \n", "3 room was very clean transportation is very ne... \n", "4 we had the best stay at cler hotel the locati... \n", ".. ... \n", "89 4 years ago i was the last time at sofitel le ... \n", "90 when arriving to the area it felt a little dan... \n", "91 ive stayed at st christopher inn canal in pari... \n", "92 hotel is in a great location minutes walk fro... \n", "93 if i know of anybody heading to paris i will r... \n", "\n", " summary \n", "0 we were blown away by this excellent hotel we ... \n", "1 The hotel is great for value the breakfast sel... \n", "2 stayed for a short city break the hotel is a ... \n", "3 hotel turned out to be perfect for our short ... \n", "4 we had the best stay at cler hotel the locati... \n", ".. ... \n", "89 4 years ago i was the last time at sofitel le ... \n", "90 Barry is the best bartender in paris cheers gr... \n", "91 ive stayed at st christopher inn canal in pari... \n", "92 Hotel is in a great location minutes walk fro... \n", "93 warwick hotel in paris is a good hotel to stay... 
\n", "\n", "[94 rows x 3 columns]" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ],
   "source": [
    "# Left-join the summaries to the distinct hotel names. The key is spelled\n",
    "# out instead of being inferred from the columns the two frames share.\n",
    "df_combined_paris.merge(hotel, how='left', on='Hotel')"
   ]
  },
  { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "interpreter": { "hash": "4bd624a0593993fe43ac4046b27b898fb2ef75c21c08f81e89e64ea0f51df676" }, "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.12" } }, "nbformat": 4, "nbformat_minor": 4 }