AzizTh committed on
Commit
d4f5493
1 Parent(s): b72263b

Rename GroupZero_Week2Assesment.py to app.py

Browse files
Files changed (1) hide show
  1. GroupZero_Week2Assesment.py → app.py +26 -205
GroupZero_Week2Assesment.py → app.py RENAMED
@@ -1,183 +1,37 @@
1
- from IPython.display import HTML, display
2
-
3
- def set_css():
4
- display(HTML('''
5
- <style>
6
- pre {
7
- white-space: pre-wrap;
8
- }
9
- </style>
10
- '''))
11
- get_ipython().events.register('pre_run_cell', set_css)
12
-
13
-
14
- !pip install huggingface datasets
15
-
16
- !pip install sentence_transformers
17
-
18
  import pandas as pd
19
- from datasets import load_dataset
20
- import numpy as np
21
  from sentence_transformers import SentenceTransformer
22
- import torch
23
- import scipy.spatial
24
-
25
- dataset = load_dataset("traversaal-ai-hackathon/hotel_datasets")
26
-
27
- df=pd.DataFrame(dataset['train'])
28
- df.head()
29
-
30
- df.rate.value_counts()
31
-
32
-
33
- df.groupby('hotel_name')['rate'].apply(lambda x: x.isnull().sum()).sort_values(ascending=False)[1:40]
34
-
35
-
36
-
37
- df['hotel_description'].isnull().sum()
38
-
39
-
40
- df['hotel_name'].value_counts()
41
-
42
- hotel_rates = df.groupby('hotel_name')['rate'].first().to_dict()
43
- len(hotel_rates)
44
-
45
- # prompt: check unique values of hotel_rates and how many times each value repeated
46
-
47
- unique_rates, rate_counts = np.unique(list(hotel_rates.values()), return_counts=True)
48
-
49
- for rate, count in zip(unique_rates, rate_counts):
50
- print(f"Rate: {rate}, Count: {count}")
51
-
52
-
53
- # Define the function to fill missing rates
54
- def fill_rate(row):
55
- if pd.isna(row['rate']):
56
- return hotel_rates.get(row['hotel_name'], row['rate']) # Return the matched rate or leave it as NaN if no match
57
- else:
58
- return row['rate']
59
-
60
- # Apply the function to each row in the DataFrame
61
- df['rate'] = df.apply(fill_rate, axis=1)
62
-
63
- df['rate'].isnull().sum()
64
-
65
- df['locality'].value_counts()
66
-
67
-
68
- # Assuming df is your DataFrame
69
- # Create a mapping for rating_value
70
- rating_value_map = {
71
- 5.0: 'Very Satisfied Customer',
72
- 4.5: 'Satisfied Customer',
73
- 4.0: 'Moderately Satisfied Customer',
74
- 3.5: 'Neutral Customer',
75
- 3.0: 'Dissatisfied Customer'
76
- }
77
-
78
- # Create a mapping for price_range
79
- price_range_map = {
80
- '$ (Based on Average Nightly Rates for a Standard Room from our Partners)': 'Economical',
81
- '$$ (Based on Average Nightly Rates for a Standard Room from our Partners)': 'Moderate',
82
- '$$$ (Based on Average Nightly Rates for a Standard Room from our Partners)': 'Expensive'
83
- }
84
-
85
- # Create a mapping for rate
86
- rate_map = {
87
- 5.0: '5 Stars',
88
- 4.0: '4 Stars',
89
- 3.0: '3 Stars',
90
- 2.0: '2 Stars',
91
- 1.0: '1 Star',
92
- float('nan'): 'not known how many stars'
93
- }
94
-
95
- # Apply the mappings to the DataFrame
96
- df['rating_value'] = df['rating_value'].map(rating_value_map)
97
- df['price_range'] = df['price_range'].map(price_range_map)
98
- df['rate'] = df['rate'].map(rate_map)
99
-
100
- df.head()
101
 
102
- from sentence_transformers import SentenceTransformer
103
 
104
  model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
105
 
106
- def create_combined_embedding(row):
107
- # Use empty strings for None values
108
- description = (row['hotel_description'] or "").strip()
109
- rate = (row['rate'] or "").strip()
110
- price_range = (row['price_range'] or "").strip()
111
-
112
- combined_text = f"hotel_description: {description}; hotel star rate: {rate}; price range: {price_range}"
113
- embedding = model.encode(combined_text)
114
- return embedding.tolist() # Convert the embedding to a list
115
-
116
- # Assuming df is your DataFrame
117
- df["hotel_combined_embedding"] = df.apply(create_combined_embedding, axis=1)
118
-
119
- df.head()
120
-
121
- df['rating_value']
122
-
123
- customer_rating = df['rating_value'].tolist()
124
- customer_rating_embeddings = model.encode(customer_rating, show_progress_bar=True)#A
125
- print(f"customer_rating_embeddings shape: {customer_rating_embeddings.shape}")
126
-
127
- len(customer_rating_embeddings)
128
-
129
- # Convert embeddings to a list of lists (each embedding is a list)
130
- embedding_list = [embedding.tolist() for embedding in customer_rating_embeddings]
131
-
132
- # Add the embeddings as a new column to the original DataFrame
133
- df['rating_value_embedding'] = embedding_list
134
-
135
- df.head()
136
-
137
- df.to_csv('df.csv', index=True)
138
-
139
  df_new = pd.read_csv('last_df.csv')
140
 
141
- df_new.head()
142
-
143
- df_new['country'].unique()
144
 
145
  df_new['country'] = df_new['country'].replace('Türkiye', 'Turkey')
146
-
147
- df_new['country'].unique()
148
-
149
- df_new.head()
150
-
151
- !python -m spacy download en_core_web_trf
152
-
153
-
154
- import spacy
155
- import pandas as pd
156
-
157
- nlp = spacy.load("en_core_web_trf")
158
-
159
- # Function to extract city name from the query
160
- def get_city_name(query):
161
- text_query = nlp(query)
162
- for city in text_query.ents:
163
- if city.label_ == "GPE":
164
- return city.text.lower()
165
- return None
166
-
167
- # Function to filter DataFrame by location
168
- def filter_by_loc(query):
169
- city_name = get_city_name(query)
170
- if city_name in df_new['locality'].str.lower().unique():
171
- filtered_df = df_new[df_new['locality'].str.lower() == city_name.lower()]
172
- return filtered_df
173
- else:
174
- return df_new
175
 
176
 
177
- query = "cheap hotel in Istanbul"
178
-
179
- query_embedding = model.encode(query)
180
- query_embedding.shape
181
 
182
  import torch.nn as nn
183
  import torch
@@ -206,17 +60,18 @@ def process_query(query):
206
  query_embedding = model.encode(query)
207
 
208
  # Filter DataFrame by location
209
- filtered_data = filter_by_loc(query)
210
 
211
  # Convert query_embedding to a tensor if it is not already
212
  query_embedding_tensor = torch.tensor(query_embedding)
213
 
214
  # Apply the similarity function to the filtered DataFrame
215
- filtered_data['similarity_score'] = filtered_data.apply(lambda row: get_similarity_score(row, query_embedding_tensor), axis=1)
216
 
 
217
 
218
 
219
- top_similar = filtered_data.sort_values('similarity_score', ascending=False).head(1)
220
 
221
 
222
  hotel_name = top_similar['hotel_name'].values[0]
@@ -240,42 +95,8 @@ def process_query(query):
240
  return result
241
 
242
 
243
- # here is the returned df
244
-
245
- result_df = process_query(query)
246
-
247
- result_df
248
-
249
 
250
 
251
- # Extract the relevant information from the top similar hotel
252
- hotel_name = top_similar['hotel_name'].values[0]
253
- hotel_description = top_similar['hotel_description'].values[0]
254
- hotel_rate = top_similar['rate'].values[0]
255
- hotel_price_range = top_similar['price_range'].values[0]
256
- hotel_review = top_similar['review_title'].values[0]
257
- hotel_city = top_similar['locality'].values[0]
258
- hotel_country = top_similar['country'].values[0]
259
-
260
- # Print the information in an ordered fashion
261
- print("query: ",query)
262
- print("-" * 30)
263
- print("Here's the most similar hotel we found:")
264
- print("-" * 30)
265
- print(f"Hotel Name: {hotel_name}")
266
- print("City:", hotel_city)
267
- print("Country:", hotel_country)
268
- # print(f"Description: {hotel_description}")
269
- print(f"Star Rating: {hotel_rate}")
270
- print(f"Price Range: {hotel_price_range}")
271
-
272
-
273
-
274
-
275
- !pip install gradio
276
-
277
-
278
- import gradio as gr
279
 
280
  ui = gr.Interface(
281
  fn=process_query,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import pandas as pd
 
 
2
  from sentence_transformers import SentenceTransformer
3
+ import gradio as gr
4
+ import spacy
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
 
6
 
7
  model = SentenceTransformer("nomic-ai/nomic-embed-text-v1", trust_remote_code=True)
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  df_new = pd.read_csv('last_df.csv')
10
 
 
 
 
11
 
12
  df_new['country'] = df_new['country'].replace('Türkiye', 'Turkey')
13
+ #
14
+ #
15
+ # nlp = spacy.load("en_core_web_trf")
16
+ #
17
+ # # Function to extract city name from the query
18
+ # def get_city_name(query):
19
+ # text_query = nlp(query)
20
+ # for city in text_query.ents:
21
+ # if city.label_ == "GPE":
22
+ # return city.text.lower()
23
+ # return None
24
+ #
25
+ # # Function to filter DataFrame by location
26
+ # def filter_by_loc(query):
27
+ # city_name = get_city_name(query)
28
+ # if city_name in df_new['locality'].str.lower().unique():
29
+ # filtered_df = df_new[df_new['locality'].str.lower() == city_name.lower()]
30
+ # return filtered_df
31
+ # else:
32
+ # return df_new
 
 
 
 
 
 
 
 
 
33
 
34
 
 
 
 
 
35
 
36
  import torch.nn as nn
37
  import torch
 
60
  query_embedding = model.encode(query)
61
 
62
  # Filter DataFrame by location
63
+ # filtered_data = filter_by_loc(query)
64
 
65
  # Convert query_embedding to a tensor if it is not already
66
  query_embedding_tensor = torch.tensor(query_embedding)
67
 
68
  # Apply the similarity function to the filtered DataFrame
69
+ # filtered_data['similarity_score'] = filtered_data.apply(lambda row: get_similarity_score(row, query_embedding_tensor), axis=1)
70
 
71
+ df_new['similarity_score'] = df_new.apply(lambda row: get_similarity_score(row, query_embedding_tensor), axis=1)
72
 
73
 
74
+ top_similar = df_new.sort_values('similarity_score', ascending=False).head(1)
75
 
76
 
77
  hotel_name = top_similar['hotel_name'].values[0]
 
95
  return result
96
 
97
 
 
 
 
 
 
 
98
 
99
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  ui = gr.Interface(
102
  fn=process_query,