amoldwalunj committed on
Commit
0776ebc
1 Parent(s): 8e6bf02

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +115 -0
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import json
4
+ import numpy as np
5
+ from fuzzywuzzy import fuzz
6
+
7
+ import pinecone
8
+ from sentence_transformers import SentenceTransformer
9
+
10
# Connect to Pinecone. The API key is a credential, so it is read from the
# environment instead of being hard-coded in the source file.
# NOTE(review): the key that was previously committed here should be treated
# as leaked and rotated in the Pinecone console.
import os

_PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
if not _PINECONE_API_KEY:
    raise RuntimeError(
        "PINECONE_API_KEY is not set; export it (or add it as a deployment "
        "secret) before starting the app."
    )

pinecone.init(
    api_key=_PINECONE_API_KEY,
    # Same default environment as before, but overridable without a code change.
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-central1-gcp'),
)

# Sentence-embedding model used to encode queries on the semantic-search page.
# NOTE(review): Streamlit re-executes the script on every interaction, so this
# object is rebuilt on each rerun; consider caching it if start-up is slow.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
13
+
14
def process_string(s):
    """Normalise text for matching.

    Lower-cases ``s`` and replaces every '&' with the word 'and'.
    """
    lowered = s.lower()
    return lowered.replace('&', 'and')
16
+
17
def levenshtein_distance(s1, s2):
    """Return the ``fuzz.ratio`` score for the two strings.

    Despite the name, callers in this file treat the value as a similarity:
    they sort it in descending order and keep the highest-scoring rows.
    """
    similarity = fuzz.ratio(s1, s2)
    return similarity
19
+
20
def _rank_by_similarity(string, df, column, top_n=5):
    """Return the ``top_n`` rows of ``df`` whose ``column`` best matches ``string``.

    The query is normalised with ``process_string`` and each value of
    ``column`` is scored with ``levenshtein_distance``. The result holds the
    'label', 'Ingredients' and 'distance' columns, highest score first.
    """
    query = process_string(string)
    scores = df[column].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # ``assign`` returns a new frame, so the caller's DataFrame is not left
    # with a stray 'distance' column from the previous query.
    ranked = df.assign(distance=scores).sort_values('distance', ascending=False)
    return ranked.head(top_n)[['label', 'Ingredients', 'distance']]


def compare_string_all(string, df, top_n=5):
    """Rank rows by how well their 'cleaned_text' column matches ``string``.

    Args:
        string: Raw query text.
        df: DataFrame with 'cleaned_text', 'label' and 'Ingredients' columns.
        top_n: Number of best-scoring rows to return (default 5).

    Returns:
        DataFrame with 'label', 'Ingredients' and 'distance' columns.
    """
    return _rank_by_similarity(string, df, 'cleaned_text', top_n)


def compare_string_label(string, df, top_n=5):
    """Rank rows by how well their 'cleaned_label' column matches ``string``.

    Args:
        string: Raw query text.
        df: DataFrame with 'cleaned_label', 'label' and 'Ingredients' columns.
        top_n: Number of best-scoring rows to return (default 5).

    Returns:
        DataFrame with 'label', 'Ingredients' and 'distance' columns.
    """
    return _rank_by_similarity(string, df, 'cleaned_label', top_n)
41
+
42
# Load the menu dataset and derive the normalised columns used for fuzzy
# matching: the combined "label : Ingredients" text and the label on its own.
df = pd.read_json('cleaned.json')

combined_text = df['label'] + ' : ' + df['Ingredients']
df['label+ingradient'] = combined_text
df['cleaned_text'] = combined_text.apply(process_string)
df['cleaned_label'] = df['label'].apply(process_string)

# Handle to the Pinecone index queried by the semantic-search page.
index = pinecone.Index('menuingradientsearch')
51
+
52
+
53
+ # Create a Streamlit app
54
def _read_query():
    """Render the shared prompt and return the normalised user input.

    Surrounding whitespace is stripped so a blank-only entry counts as empty.
    """
    st.write("Enter a menu along with its ingredients:")
    st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
    return process_string(st.text_input("")).strip()


def _semantic_matches(query, top_k=10):
    """Embed ``query``, search the Pinecone index and tabulate the matches.

    Returns a DataFrame with 'labels', 'ingradients' and 'score' columns, one
    row per match returned by the index.
    """
    xq = model.encode([query]).tolist()
    result = index.query(xq, top_k=top_k, includeMetadata=True)
    rows = [
        (match['metadata']['label'], match['metadata']['Ingredients'], match['score'])
        for match in result['matches']
    ]
    return pd.DataFrame(rows, columns=['labels', 'ingradients', 'score'])


def main():
    """Run the Streamlit app: a fuzzy-match page and a semantic-search page."""
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("String Matching App :smiley:")

    # The sidebar radio button selects which page is rendered.
    pages = ["Fuzzy match", "Semantic search"]
    page = st.sidebar.radio("Select a page", pages)

    if page == pages[0]:
        st.header("Matches using levenshtein_distance")
        input_string = _read_query()

        if input_string:
            st.write("Top 5 matches:")
            # Inputs longer than four words are compared against the combined
            # label + ingredients text; shorter ones against the label only.
            if len(input_string.split()) > 4:
                top_matches = compare_string_all(input_string, df)
            else:
                top_matches = compare_string_label(input_string, df)
            st.dataframe(top_matches)

    elif page == pages[1]:
        st.header("Matches using embeddings (semantic search)")
        input_string = _read_query()

        if input_string:
            st.write("Top 10 matches using semantic search:")
            st.dataframe(_semantic_matches(input_string))
113
+
114
# Start the app only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()