pdjewell committed on
Commit
0e36e26
1 Parent(s): 4999104

first commit to hf space

Browse files
.DS_Store ADDED
Binary file (6.15 kB). View file
 
app.py ADDED
@@ -0,0 +1,236 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ from PIL import Image
5
+ import streamlit as st
6
+ from streamlit import components
7
+ from datasets import Dataset, load_dataset, load_from_disk
8
+ import faiss
9
+ from scripts.preprocessing import preprocess
10
+
11
# App config
# Browser-tab icon, page title and wide layout for the Streamlit page.
icon = Image.open('./images/wine_icon.png')
st.set_page_config(
    page_title="Sommeli-AI",
    page_icon=icon,
    layout="wide",
)

# CSS injected into the page: the main menu stays visible, the footer is hidden.
hide_default_format = """
<style>
#MainMenu {visibility: visible; }
footer {visibility: hidden;}
</style>
"""
st.markdown(hide_default_format, unsafe_allow_html=True)
23
+
24
+ # App functions
25
@st.cache_data
def read_data(ds_path=None):
    """Load the wine embeddings dataset and return it as a preprocessed DataFrame.

    Args:
        ds_path: Location of a dataset previously saved to disk, read with
            ``load_from_disk``. When ``None``, the ``train`` split of
            ``pdjewell/sommeli_ai`` is fetched with ``load_dataset`` instead.

    Returns:
        The DataFrame returned by ``preprocess`` (adds the type column and
        removes duplicates).
    """
    if ds_path is None:
        embeddings_dataset = load_dataset("pdjewell/sommeli_ai", split="train")
    else:
        # Read in hf file
        embeddings_dataset = load_from_disk(ds_path)

    # Switch the output format so that slicing the dataset yields a DataFrame.
    embeddings_dataset.set_format("pandas")
    raw_df = embeddings_dataset[:]

    return preprocess(raw_df)
42
+
43
+
44
def get_neighbours(df, query_embedding, k=6,
                   metric='inner'):
    """Return the rows of ``df`` whose embeddings are nearest to a query vector.

    The frame is converted to a Hugging Face ``Dataset``, a FAISS index is
    built over its ``embeddings`` column and the index is queried once.

    Args:
        df: DataFrame holding an ``embeddings`` column.
        query_embedding: Vector to search for.
        k: Number of nearest examples to request.
        metric: ``'inner'`` selects ``faiss.METRIC_INNER_PRODUCT``; any other
            value selects ``faiss.METRIC_L2``.

    Returns:
        A ``(scores, samples)`` pair as returned by
        ``Dataset.get_nearest_examples``, with the ``embeddings`` column and
        the pandas index column removed from ``samples``.
    """
    # convert from pandas df to hf ds
    ds = Dataset.from_pandas(df)
    ds.reset_format()
    ds = ds.with_format("np")

    # add faiss index
    if metric == 'inner':
        metric_type = faiss.METRIC_INNER_PRODUCT
    else:
        metric_type = faiss.METRIC_L2
    ds.add_faiss_index(column="embeddings", metric_type=metric_type)

    scores, samples = ds.get_nearest_examples(
        "embeddings", query_embedding, k=k)

    # '__index_level_0__' only exists when the frame carried a non-default
    # index, so both helper columns are removed without assuming they exist.
    samples.pop('embeddings', None)
    samples.pop('__index_level_0__', None)

    return scores, samples
67
+
68
+
69
def _apply_column_filters(df: pd.DataFrame, columns, key_prefix: str) -> pd.DataFrame:
    """Render one filter widget per chosen column and return the narrowed frame.

    Args:
        df: Frame to filter.
        columns: Column names picked by the user.
        key_prefix: Prefix for the widget keys, so that the same column can be
            filtered in more than one panel without two widgets being identical.

    Returns:
        ``df`` restricted to the rows matching every widget's selection.
    """
    for column in columns:
        widget_key = f"{key_prefix}_{column}"
        if column in ('Score', 'Price'):  # Use slider for 'points' and 'price'
            min_val = 0
            col_max = df[column].max()
            if pd.isna(col_max):
                # Empty frame or all-missing column: no range to offer.
                continue
            # Round a fractional maximum up so the row holding it stays
            # inside the slider's full range.
            max_val = int(col_max)
            if max_val < col_max:
                max_val += 1
            if max_val <= min_val:
                # Degenerate range: nothing for the user to narrow down.
                continue
            user_input = st.slider(f"Values for {column}", min_val, max_val,
                                   (min_val, max_val), key=widget_key)
            df = df[(df[column] >= user_input[0]) & (df[column] <= user_input[1])]
        elif column in ('Country', 'Province', 'Region', 'Variety', 'Winery'):  # Use multiselect for these columns
            unique_values = df[column].dropna().unique()
            # Select only the first unique value if it exists
            default_values = [unique_values[0]] if len(unique_values) > 0 else []
            user_input = st.multiselect(f"Values for {column}", unique_values,
                                        default_values, key=widget_key)
            df = df[df[column].isin(user_input)]

    return df


def filter_df_search(df: pd.DataFrame) -> pd.DataFrame:
    """Optionally narrow the wine search selection with extra column filters.

    A checkbox switches the filters on. When it is unticked ``df`` is returned
    unchanged; otherwise the user picks columns and one widget per column is
    rendered to restrict the rows.

    Args:
        df: Frame of wines currently available to the search box.

    Returns:
        The filtered frame. The input frame itself is never modified.
    """
    modify_search = st.checkbox("🔍 Further filter search selection")

    if not modify_search:
        return df

    modification_container_search = st.container()

    with modification_container_search:
        to_filter_columns = st.multiselect("Filter on:",
                                           ['Province', 'Region', 'Winery','Score', 'Price'],
                                           key='search')
        df = _apply_column_filters(df, to_filter_columns, key_prefix='search')

    return df


def filter_df_recs(df: pd.DataFrame) -> pd.DataFrame:
    """Optionally narrow the pool of wines that recommendations are drawn from.

    A checkbox switches the filters on. When it is unticked ``df`` is returned
    unchanged; otherwise the user picks columns and one widget per column is
    rendered to restrict the rows.

    Args:
        df: Frame of candidate wines for the recommendations.

    Returns:
        The filtered frame. The input frame itself is never modified.
    """
    modify_recs = st.checkbox("🔍 Filter recommendation results")

    if not modify_recs:
        return df

    modification_container_recs = st.container()

    with modification_container_recs:
        to_filter_columns2 = st.multiselect("Filter on:",
                                            ['Country','Province', 'Region', 'Variety', 'Winery',
                                             'Score', 'Price'],
                                            key='recs')
        df = _apply_column_filters(df, to_filter_columns2, key_prefix='recs')

    return df
131
+
132
+
133
if __name__ == "__main__":
    # NOTE(review): this block was recovered from a diff view that had lost
    # its indentation. The nesting below (everything search-related inside
    # col1, the plot inside col2) is assumed - confirm against the live layout.
    st.title("🍷 Sommeli-AI")
    col1, col2 = st.columns([0.6,0.4], gap="medium")

    # Read in data. With ds_path=None the dataset comes from load_dataset;
    # pass a path such as "./data/wine_ds.hf" to read a copy saved on disk.
    df = read_data(ds_path=None)

    with col2:
        st.header("Explore the world of wine 🌍")
        wine_plot = st.radio('Select plot type:', ['2D','3D'],
                             label_visibility = "hidden",
                             horizontal=True)
        st.text("Click the legend categories to filter")

        # Only the plot that is actually shown gets read from disk.
        plot_files = {'2D': './images/px_2d.html',
                      '3D': './images/px_3d.html'}
        # presumably UTF-8 encoded HTML exports - confirm
        with open(plot_files[wine_plot], 'r', encoding='utf-8') as file:
            plot_html = file.read()
        # Display the HTML plot in the Streamlit app
        components.v1.html(plot_html, width=512, height=512)

    with col1:

        st.header("Search for similar wines 🥂")
        # Select wine type: default is all
        wine_types = df['Type'].unique()
        selected_wine_types = st.multiselect("Select category 👇", wine_types, default=wine_types)
        df = df[df['Type'].isin(selected_wine_types)]
        subcol1, subcol2 = st.columns([0.5,0.5], gap="small")
        with subcol1:
            # Select wine variety: default is all
            wine_vars = df['Variety'].unique()
            selected_wine_vars = st.multiselect("Narrow down the variety 🍇",['Select all'] + list(wine_vars),
                                                default = 'Select all')
            if "Select all" in selected_wine_vars:
                df_search = df
            else:
                df_search = df[df['Variety'].isin(selected_wine_vars)]

        with subcol2:
            # Select the country: default is all
            countries = df_search['Country'].unique()
            selected_countries = st.multiselect("Narrow down the country 🌎",['Select all'] + list(countries),
                                                default = 'Select all')
            if "Select all" not in selected_countries:
                df_search = df_search[df_search['Country'].isin(selected_countries)]

        # Add additional filters
        df_search = filter_df_search(df_search)

        # Create a search bar for the wine 'title'
        selected_wine = st.selectbox("Search for and select a wine 👇", [''] + list(df_search["Title"].unique()))

        if selected_wine:
            is_selected = df['Title']==selected_wine

            # Get the embedding for selected_wine
            query_embedding = df.loc[is_selected, 'embeddings'].iloc[0]

            tasting_notes = df.loc[is_selected, 'Tasting notes'].iloc[0]
            st.write(f"Tasting notes: {tasting_notes}")

            # CSS to inject contained in a string
            hide_table_row_index = """
<style>
thead tr th:first-child {display:none}
tbody th {display:none}
</style>
"""
            # Inject CSS with Markdown
            st.markdown(hide_table_row_index, unsafe_allow_html=True)

            # Display selected wine
            st.header(" 🍷 Your selected wine")
            selected_cols = ['Title','Country','Province','Region','Winery',
                             'Variety','Tasting notes','Score']
            st.table(df.loc[is_selected, selected_cols].fillna(""))

            # Slider for results to show
            k = st.slider("Choose how many similar wines to show 👇", 1, 10, value=4)

            # Filter recommendation results
            df_results = filter_df_recs(df)

            # Display results as table
            if st.button("🔘 Press me to generate similar tasting wines"):
                if df_results.empty:
                    # An index cannot be searched when the filters removed
                    # every candidate wine.
                    st.warning("No wines match the recommendation filters.")
                else:
                    # Ask for one extra neighbour because the selected wine
                    # is normally its own closest match.
                    scores, samples = get_neighbours(df_results, query_embedding,
                                                     k=k+1, metric='l2')
                    recs_df = pd.DataFrame(samples).fillna("")
                    # Remove the selected wine wherever it ranks (it may be
                    # absent when the filters excluded it), then keep the k
                    # closest remaining wines.
                    recs_df = recs_df[recs_df['Title'] != selected_wine].head(k)
                    # Display results
                    st.header(f"🍾 Top {k} similar tasting wines")
                    st.table(recs_df[selected_cols])

        else:
            print("Awaiting selection")
images/.DS_Store ADDED
Binary file (6.15 kB). View file
 
images/px.html ADDED
The diff for this file is too large to render. See raw diff
 
images/px_2d.html ADDED
The diff for this file is too large to render. See raw diff
 
images/px_3d.html ADDED
The diff for this file is too large to render. See raw diff
 
images/screenshot_1.png ADDED
images/screenshot_2.png ADDED
images/tsne.png ADDED
images/wine_icon.png ADDED
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ datasets==2.12.0
2
+ faiss-cpu==1.7.4
3
+ gitdb==4.0.10
4
+ numpy==1.24.3
5
+ pandas==2.0.2
6
+ Pillow==9.5.0
7
+ pyarrow==12.0.0
8
+ streamlit==1.23.1
scripts/.DS_Store ADDED
Binary file (6.15 kB). View file
 
scripts/__pycache__/preprocessing.cpython-310.pyc ADDED
Binary file (12 kB). View file
 
scripts/preprocessing.py ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ from pathlib import Path
5
+
6
def preprocess(df: pd.DataFrame) -> pd.DataFrame:
    """Deduplicate the wine reviews, add a wine-type column and rename columns.

    Steps, in order:
      1. Drop rows whose ``description`` repeats an earlier row.
      2. Map ``variety`` to one of 'Red', 'White', 'Rosé', 'Sparkling' or
         'Fortified'; varieties missing from every list default to 'Red'.
      3. Rename the raw column names to the display names used by the app.

    Args:
        df: Raw frame with at least ``description`` and ``variety`` columns.

    Returns:
        A new frame; ``df`` itself is left untouched.
    """
    white = ['White Blend', 'Pinot Gris', 'Riesling', 'Chardonnay', 'Chenin Blanc', 'Sauvignon Blanc',
             'Viognier-Chardonnay', 'Catarratto', 'Inzolia', 'Bordeaux-style White Blend', 'Grillo',
             'Albariño', 'Petit Manseng', 'Vernaccia', 'Grüner Veltliner', 'Viognier', 'Vermentino',
             'Grenache Blanc', 'Pinot Blanc', 'Alsace white blend', 'Portuguese White', 'Verdejo',
             'Fumé Blanc', 'Pinot Bianco', 'Ugni Blanc-Colombard', 'Friulano', 'Assyrtico', 'Vignoles',
             'Muscat', 'Muscadelle', 'Garganega', 'Pinot Grigio','Cortese', 'Melon', 'Vidal', 'Verdelho',
             'Marsanne', 'Vilana', 'Viura', 'Verduzzo', 'Verdicchio', 'Colombard', 'Sylvaner', 'Sémillon',
             'Antão Vaz', 'Verdejo-Viura', 'Chenin Blanc-Chardonnay', 'Insolia', 'Ribolla Gialla',
             'Weissburgunder', 'Traminer', 'Prié Blanc', 'Müller-Thurgau', 'Pansa Blanca', 'Muskat Ottonel',
             'Sauvignon Blanc-Semillon', 'Semillon-Sauvignon Blanc', 'Bical', 'Viura-Chardonnay', 'Malvasia Bianca',
             'Rhône-style White Blend', 'Scheurebe', 'Kerner', 'Carricante', 'Fiano', 'Früburgunder', 'Roussanne',
             'Avesso', 'Chinuri', 'Muscat Blanc à Petits Grains', 'Xarel-lo', 'Greco', 'Trebbiano', 'Prié Blanc',
             'Falanghina', 'Bical', 'Gelber Muskateller', 'Turbiana', 'Refosco', 'Alvarinho', 'Manzoni', 'Assyrtiko',
             'Welschriesling', 'Rieslaner', 'Traminette', 'Marsanne-Viognier', 'Gewürztraminer-Riesling',
             'Austrian white blend', 'Tocai', 'Chardonnay-Viognier', 'Fernão Pires', 'Seyval Blanc', 'Muscat Canelli',
             'Arinto', 'Arneis', 'Malvasia', 'Altesse', 'Blanc du Bois', 'Provence white blend', 'Nosiola',
             'Roussanne-Viognier', 'Godello', 'Auxerrois', 'Albana', 'Muskat', 'Grechetto', 'Encruzado',
             'Garnacha Blanca', 'Pallagrello', 'Morava', 'Aleatico', 'Nascetta', 'Siria', 'Asprinio', 'Feteascǎ Regalǎ',
             'Tocai Friulano', 'Schiava', 'Chardonnay-Semillon', 'Palomino', 'Norton',
             'Loureiro-Arinto', 'Symphony', 'Edelzwicker', 'Madeira Blend', 'Gros and Petit Manseng', 'Jacquère',
             'Chenin Blanc-Sauvignon Blanc', 'Marzemino', 'Chardonnay-Sauvignon Blanc', 'Trebbiano Spoletino',
             'Chasselas', 'Hárslevelü', 'Siegerrebe','Colombard-Sauvignon Blanc', 'Diamond',
             'Gros Manseng', 'Muskateller', 'Aligoté', 'Muscat Blanc', 'Viognier-Roussanne', 'Pallagrello Bianco',
             'Veltliner', 'Chardonnay-Sauvignon', 'Chenin Blanc-Viognier', 'Vitovska', 'Grauburgunder', 'Macabeo',
             'Verdil', 'Treixadura', 'Coda di Volpe', 'Viura-Verdejo', 'Bombino Bianco', 'Pinot-Chardonnay',
             "Muscat d'Alexandrie", 'Chardonnay-Pinot Gris', 'Chardonnay-Pinot Blanc','Piquepoul Blanc', 'Orange Muscat',
             'Ugni Blanc', 'Semillon-Chardonnay', 'Irsai Oliver', 'Greco Bianco', 'Viognier-Grenache Blanc', 'Pignoletto',
             'Muscatel', 'White Riesling', 'Hondarrabi Zuri', 'Nuragus', 'Xynisteri', 'Sauvignon Musqué', 'Roussanne-Marsanne',
             'Incrocio Manzoni', 'Terrantez', 'Bual', 'Verdejo-Sauvignon Blanc', 'Malvasia-Viura', 'Savatiano',
             'Macabeo-Chardonnay', 'Tamjanika', 'Macabeo-Moscatel', 'Códega do Larinho','Pinot Gris-Gewürztraminer',
             'Viosinho', 'Paralleda', 'Malvar', 'Airen', 'Erbaluce', 'Verdosilla', 'Aidani', 'Vinhão', 'Rolle', 'Orangetraube',
             'Žilavka', 'Portuguiser', 'Gouveio', 'Bombino Nero', 'Malagouzia-Chardonnay', 'Elbling', 'Gragnano',
             'Pinot Blanc-Chardonnay', 'Petit Meslier', 'Chardonnay Weissburgunder', 'Robola', 'Folle Blanche', 'Malagouzia',
             'Rabigato', 'Sauvignonasse', 'Meseguera', 'Alvarinho-Chardonnay', 'Pinot Blanc-Viognier', 'Biancu Gentile',
             'Xinisteri','Moschofilero-Chardonnay','Sauvignon Blanc-Sauvignon Gris', 'Trebbiano di Lugana', 'Verdeca',
             'Chardonel', 'Silvaner-Traminer', 'Uvalino', 'Merseguera-Sauvignon Blanc', 'Cayuga',
             'Nasco', 'Vital', 'Apple', 'Pinot Grigio-Sauvignon Blanc', 'Valvin Muscat', 'Malvasia Fina',
             'Roditis-Moschofilero', 'Premsal', 'Jampal', 'Tokay Pinot Gris', 'Trajadura', 'Roscetto', 'Torontel',
             'Viognier-Valdiguié',
             'Zierfandler', 'Marsanne-Roussanne', 'Pinot Meunier', 'Muskat Ottonel', 'Moscatel', 'Moschofilero', 'White Port',
             'Kisi', 'Kangoun', 'Posip', 'Uva di Troia', 'Zierfandler-Rotgipfler', 'Mauzac', 'Pinot Auxerrois', 'Neuburger',
             'Sämling', 'Rkatsiteli', 'Trousseau Gris', 'Malvasia Istriana', 'Morillon', 'Tokay', 'Gros Plant', 'Muscat Hamburg',
             'Emir', 'Tsolikouri', 'Narince', 'Grecanico', 'Madeleine Angevine', 'Doña Blanca', 'Graševina', 'Thrapsathiri',
             'Cococciola', 'Plyto', 'Azal', 'Moscatel Graúdo', 'Malvasia di Candia', 'Maria Gomes', 'Muscat of Alexandria',
             'Moscatel de Alejandría', 'Misket', 'Tamianka', 'Morio Muskat', 'Sauvignonasse',
             'Viognier-Marsanne', 'Ryzlink Rýnský', 'Muscadel', 'Roussanne-Grenache Blanc', 'Chancellor', 'Picapoll',
             'Blauburger', 'Athiri', 'Ondenc','Gewürztraminer', 'Torrontés', 'Furmint', 'Savagnin', 'Glera',
             'Roter Veltliner', 'Silvaner', 'Ruché', 'Pecorino', 'Sauvignon Gris', 'Vidal Blanc', 'Albanello',
             'Loureiro', 'Clairette', 'Verduzzo Friulano ', "Loin de l'Oeil", 'Timorasso', 'Pigato', 'Viognier-Gewürztraminer',
             'Sauvignon Blanc-Chenin Blanc', 'Colombard-Ugni Blanc', 'Mtsvane', 'Rivaner', 'Vespaiolo', 'Biancolella',
             'Riesling-Chardonnay', 'Maria Gomes-Bical', 'Gelber Traminer', 'Sercial', 'Grenache Gris', 'Chardonnay-Albariño',
             'Roditis', 'Papaskarasi', 'Zibibbo', 'Malagousia', 'Rotgipfler', 'Durella', 'Cercial', 'Johannisberg Riesling',
             'Teran', 'Mantonico', 'Timorasso', 'Zlahtina', 'Shiraz-Roussanne', 'Tămâioasă Românească', 'Ansonica', 'Feteasca',
             'Catalanesca', 'Moscato di Noto', 'Moscato Giallo','Sauvignon Blanc-Chardonnay', 'Sauvignon-Sémillon', "Cesanese d'Affile",
             'Sauvignon Blanc-Verdejo', 'Chardonnay-Riesling', 'Sauvignon Blanc-Assyrtiko','Zelen', 'Tempranillo Blanco',
             'Roter Traminer','Picpoul'
             ]
    # NOTE(review): 'Tannat-Cabernet', 'Sauvignon' below look like one entry
    # split in two - kept exactly as found; confirm against the data.
    red = ['Portuguese Red', 'Pinot Noir', 'Tempranillo-Merlot', 'Frappato', 'Cabernet Sauvignon',
           'Nerello Mascalese', 'Malbec', 'Tempranillo Blend', 'Meritage', 'Red Blend', 'Merlot',
           "Nero d'Avola", 'Gamay', 'Primitivo', 'Sangiovese', 'Cabernet Franc', 'Bordeaux-style Red Blend',
           'Aglianico', 'Petite Sirah', 'Touriga Nacional', 'Carmenère', 'Rosso', 'Shiraz-Cabernet Sauvignon',
           'Barbera', 'Rhône-style Red Blend', 'Graciano', 'Tannat-Cabernet', 'Sauvignon', 'Sangiovese Grosso',
           'Bonarda', 'Shiraz', 'Montepulciano', 'Grenache', 'Syrah', 'Nebbiolo', 'Blaufränkisch', 'Carignan-Grenache',
           'Sagrantino', 'Cabernet Sauvignon-Syrah', 'Tempranillo','Mencía', 'Zweigelt', 'Cannonau', 'Dolcetto',
           'Garnacha Tintorera', 'Pinot Nero', 'Pinotage', 'Syrah-Grenache', 'Antão Vaz', 'Cabernet Sauvignon-Carmenère',
           'Tinta Miúda', 'Monastrell', 'Merlot-Malbec', 'Cabernet Sauvignon-Merlot', 'Merlot-Argaman', 'Garnacha',
           'Negroamaro', 'Mourvèdre', 'Syrah-Cabernet', 'Tannat', 'Cabernet Sauvignon-Sangiovese', 'Austrian Red Blend',
           'Teroldego', 'Baga','Pinot Noir-Gamay', 'Cinsault', 'Corvina, Rondinella, Molinara', 'Tannat-Syrah', 'Charbono',
           'Provence red blend', 'Claret','Malbec-Merlot', 'Monastrell-Syrah', 'Malbec-Tannat', 'Malbec-Cabernet Franc',
           'Tinta de Toro', 'Cabernet Moravia', 'Chambourcin', 'Nero di Troia', 'Cesanese', 'Lagrein', 'Tinta Fina', 'St. Laurent',
           'Cabernet Sauvignon-Shiraz', 'Syrah-Cabernet Sauvignon', 'Pugnitello', 'Touriga Nacional Blend', 'Tinta Roriz',
           'Cabernet Franc-Cabernet Sauvignon', 'Grenache-Syrah', 'Tempranillo-Cabernet Sauvignon', 'Merlot-Cabernet Franc',
           'Syrah-Petite Sirah', 'Cabernet Blend', 'Maturana', 'Magliocco', 'Gamay Noir', 'Spätburgunder', 'Plavac Mali',
           'Lemberger', 'Saperavi', 'Dornfelder', 'Ojaleshi', 'Mondeuse', 'Perricone', 'Syrah-Merlot', 'Cabernet Sauvignon-Malbec',
           'Tinto Fino', 'Malbec-Cabernet Sauvignon','Carignano', 'Cabernet Franc-Merlot',
           'Syrah-Petit Verdot', 'Syrah-Mourvèdre', 'Shiraz-Grenache', 'Grenache-Carignan', 'Malbec-Syrah',
           'Cabernet Sauvignon-Tempranillo', 'Carignan', 'Cabernet-Syrah', 'Merlot-Cabernet Sauvignon',
           'Mourvèdre-Syrah', 'Negrette', 'Tinta Barroca', 'Merlot-Tannat','Castelão',
           'Grenache Blend', 'Sangiovese Cabernet', 'Touriga Nacional-Cabernet Sauvignon', 'Cabernet Sauvignon-Cabernet Franc',
           'Baco Noir', 'Tempranillo-Tannat', 'Touriga Franca', 'Barbera-Nebbiolo', 'Prieto Picudo', 'Gaglioppo', 'Carignane',
           'Tannat-Merlot', 'Nerello Cappuccio', 'Counoise', 'Mazuelo', 'Tinta del Pais', 'Vranec', 'Mavrud', 'Cabernet',
           'Grenache-Mourvèdre', 'Forcallà', 'Syrah-Tempranillo', 'Cabernet Sauvignon-Barbera', 'Merlot-Cabernet', 'Jaen',
           'Tinta del Toro', 'Prunelard', 'Garnacha-Syrah', 'Rufete', 'Tempranillo-Shiraz','Mansois',
           'Mataro', 'Tinta Cao', 'Blauer Portugieser', 'Groppello', 'Poulsard', 'Grenache-Shiraz', 'Baga-Touriga Nacional',
           'Carineña', 'Ciliegiolo', 'Cabernet Sauvignon-Merlot-Shiraz', 'Sciaccerellu', 'Alicante', 'Rosenmuskateller',
           'Malbec-Cabernet', 'Touriga', 'Carmenère-Syrah', 'Mavroudi', 'Pinot Blanc-Pinot Noir', 'Tinto Velasco', 'Kadarka',
           'Sangiovese-Syrah', 'Tannat-Cabernet Franc', 'Fer Servadou', 'Mission', 'Kekfrankos', 'Blauburgunder', 'Marquette',
           'Romorantin', 'Braucol', 'Cabernet Franc-Malbec', 'Pallagrello Nero', 'Rebula', 'Vespolina', 'Shiraz-Malbec',
           'Rebo', 'Tempranillo-Malbec', 'Trousseau', 'Bacchus', 'Syrah-Malbec', 'Syrah-Cabernet Franc', 'Cariñena-Garnacha',
           'Sideritis','Rara Neagra', 'Molinara', 'Abouriou', 'Nielluciu', 'Malbec-Bonarda', 'Garnacha-Monastrell', 'Souzao',
           'Tinta Francisca', 'Malvasia Nera', 'Listán Negro', 'Pinotage-Merlot', 'Jacquez', 'Carignan-Syrah', 'Mavrotragano',
           'Bovale', 'Frankovka', 'Garnacha Blend', 'Merlot-Shiraz', 'Malbec Blend', 'Merlot-Syrah', 'Babić', 'Yapincak',
           'Mandilaria', 'Saperavi-Merlot', 'Teroldego Rotaliano', 'Garnacha-Tempranillo','Vermentino Nero',
           'Albarossa', 'Cabernet Sauvignon Grenache', 'Black Monukka', 'Merlot-Grenache', 'Vranac', 'Tempranillo-Syrah',
           'Boğazkere', 'Tinta Amarela', 'Tinta Negra Mole', 'Chelois', 'Shiraz-Tempranillo', 'Biancale', 'Syrah-Bonarda',
           'Durif', 'Franconia', 'Malbec-Tempranillo', 'Monastrell-Petit Verdot', 'Sirica', 'Espadeiro', 'Blatina', 'Karalahna',
           'Garnacha-Cabernet', 'Garnacha-Cariñena', 'Cabernet Franc-Lemberger', 'Shiraz-Mourvèdre', 'Mavrokalavryta', 'Favorita',
           'Babosa Negro', 'Dafni', 'Petit Courbu', 'Kotsifali', 'Parraleta', 'Otskhanuri Sapere', 'Trollinger',
           'Tsapournakos', 'Francisa', 'Kuntra', 'Pignolo', 'Schwartzriesling','Sousão', 'Feteasca Neagra', 'Kinali Yapincak',
           'Kalecik Karasi', 'Karasakiz', 'Raboso', 'Trepat', 'Freisa', 'Trincadeira', 'Melnik', 'Argaman', 'Piedirosso',
           'Marawi', 'Çalkarası', 'Tinta Francisca', 'Vidadillo', 'Other', 'Cabernet Pfeffer', 'Roviello', 'Colorino',
           'Tinta Madeira', 'Centesimino', 'Ramisco', 'Gamza', 'Bobal-Cabernet Sauvignon',
           'Petit Verdot', 'Zinfandel', 'G-S-M', 'Monica', 'Cabernet Merlot', 'Cabernet Franc-Carmenère',
           'Grenache Noir', 'Xinomavro', 'Petite Verdot', 'Tempranillo-Garnacha', 'Carmenère-Cabernet Sauvignon',
           'Sangiovese-Cabernet Sauvignon', 'Shiraz-Cabernet', 'Syrah-Grenache-Viognier', 'Cabernet-Shiraz', 'Syrah-Carignan',
           'Cabernet-Malbec', 'Merlot-Petite Verdot', 'Duras', 'Aragonês', 'Agiorgitiko', 'Aragonez', 'Alfrocheiro', 'Corvina',
           'Alicante Bouschet', 'Tinto del Pais', 'Bobal', 'Susumaniello', 'Grolleau', 'Canaiolo', 'Bastardo', 'Tintilia',
           'St. Vincent', 'Caprettone','Black Muscat','Muscadine','Syrah-Viognier', 'Shiraz-Viognier', 'Carcajolu',
           'Marselan', 'Malbec-Petit Verdot', 'Grignolino', 'Pinot Noir-Syrah', 'Malbec-Carménère','País', 'Alvarelhão',
           'Okuzgozu', 'Tintilia','Mavrodaphne','Tintilia ',
           ]

    rose = ['Rosé', 'Rosato', 'Rosado','Portuguese Rosé', 'Prugnolo Gentile']

    sparkling = ['Champagne Blend', 'Prosecco', 'Sparkling Blend','Portuguese Sparkling',
                 'Cerceal', 'Lambrusco','Lambrusco di Sorbara','Lambrusco Grasparossa',
                 'Torbato', 'Moscadello', 'Passerina', 'Brachetto', 'Ekigaïna', 'Picolit',
                 'Sacy', 'Moscatel Roxo', 'Debit','Moscato', 'Valdiguié', 'Casavecchia',
                 'Lambrusco Salamino', 'Moscato Rosa']

    fortified = ['Sherry', 'Pedro Ximénez', 'White Port', 'Tokaji','Port']

    # Build the variety -> type lookup. Groups are applied in this order and a
    # later group wins when a variety is listed twice (e.g. 'Antão Vaz' ends
    # up 'White', 'White Port' ends up 'Fortified').
    wine_dict = {}
    for label, varieties in (('Red', red),
                             ('White', white),
                             ('Rosé', rose),
                             ('Sparkling', sparkling),
                             ('Fortified', fortified)):
        wine_dict.update(dict.fromkeys(varieties, label))

    # Remove duplicates (returns a new frame, so the caller's df is untouched)
    deduped = df.drop_duplicates(subset='description', keep="first")

    # Apply wine type dict map; any variety without an entry defaults to 'Red'.
    # Assigning the filled Series explicitly avoids relying on an in-place
    # fillna propagating back through a column selection.
    wine_type = deduped['variety'].map(wine_dict).fillna('Red')
    typed = deduped.assign(type=wine_type)

    # Rename cols
    return typed.rename(columns={'country':'Country',
                                 'description':'Tasting notes',
                                 'designation':'Designation',
                                 'points':'Score',
                                 'price': 'Price',
                                 'province':'Province',
                                 'region_1': 'Region',
                                 'title':'Title',
                                 'variety':'Variety',
                                 'winery':'Winery',
                                 'type':'Type'})