amoldwalunj committed on
Commit
819f923
0 Parent(s):

Duplicate from amoldwalunj/matches_using_lavenstein_and_embeddings

Browse files
Files changed (5) hide show
  1. .gitattributes +35 -0
  2. README.md +13 -0
  3. app.py +115 -0
  4. cleaned.json +3 -0
  5. requirements.txt +6 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tflite filter=lfs diff=lfs merge=lfs -text
29
+ *.tgz filter=lfs diff=lfs merge=lfs -text
30
+ *.wasm filter=lfs diff=lfs merge=lfs -text
31
+ *.xz filter=lfs diff=lfs merge=lfs -text
32
+ *.zip filter=lfs diff=lfs merge=lfs -text
33
+ *.zst filter=lfs diff=lfs merge=lfs -text
34
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
35
+ cleaned.json filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Matches Using Levenshtein And Embeddings
3
+ emoji: 🦀
4
+ colorFrom: green
5
+ colorTo: blue
6
+ sdk: streamlit
7
+ sdk_version: 1.17.0
8
+ app_file: app.py
9
+ pinned: false
10
+ duplicated_from: amoldwalunj/matches_using_lavenstein_and_embeddings
11
+ ---
12
+
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import json
4
+ import numpy as np
5
+ from fuzzywuzzy import fuzz
6
+
7
+ import pinecone
8
+ from sentence_transformers import SentenceTransformer
9
+
10
import os

# SECURITY: the Pinecone API key must never live in source control. It is read
# from the environment instead (on Hugging Face Spaces, define it as a Space
# secret). The key that used to be hard-coded on this line is in the public
# commit history and should be revoked/rotated in the Pinecone console.
_pinecone_api_key = os.environ.get("PINECONE_API_KEY")
if not _pinecone_api_key:
    # Fail at start-up with an actionable message instead of an opaque
    # authentication error on the first query.
    raise RuntimeError(
        "PINECONE_API_KEY is not set; configure it as an environment "
        "variable (or Space secret) before starting the app."
    )

pinecone.init(
    api_key=_pinecone_api_key,
    # Overridable, but defaults to the environment this file was written for.
    environment=os.environ.get("PINECONE_ENVIRONMENT", "us-central1-gcp"),
)

# Sentence-embedding model; main() uses it to encode the query text on the
# "Semantic search" page. Inference is pinned to the CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
13
+
14
def process_string(s):
    """Return ``s`` lower-cased, with every ``&`` replaced by ``and``."""
    lowered = s.lower()
    return lowered.replace('&', 'and')
16
+
17
def levenshtein_distance(s1, s2):
    """Return fuzzywuzzy's ``fuzz.ratio`` score for ``s1`` against ``s2``.

    NOTE: despite the name this is a *similarity* score, not an edit
    distance: ``fuzz.ratio`` yields an integer from 0 to 100 where a larger
    value means a closer match. The callers in this file rely on that by
    sorting the scores in descending order.
    """
    similarity = fuzz.ratio(s1, s2)
    return similarity
19
+
20
def _top_matches(string, df, column, limit=5):
    """Rank the rows of ``df`` by how well ``df[column]`` matches ``string``.

    The query is lower-cased and ``&`` is replaced with ``and`` before being
    scored against each (lower-cased) value of ``column`` with
    ``levenshtein_distance``, whose score is higher for closer matches.

    Args:
        string: Raw query text.
        df: Frame containing ``label``, ``Ingredients`` and ``column``.
        column: Name of the text column to score against.
        limit: Maximum number of rows to return.

    Returns:
        A new DataFrame with the columns ``label``, ``Ingredients`` and
        ``distance`` holding the ``limit`` best-scoring rows, best first.
        ``df`` itself is left untouched.
    """
    query = string.lower().replace('&', 'and')
    scores = df[column].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # assign() returns a fresh frame, so the caller's (module-level) DataFrame
    # no longer grows a 'distance' column as a side effect of every search.
    ranked = df[['label', 'Ingredients']].assign(distance=scores)
    return ranked.sort_values('distance', ascending=False).head(limit)


def compare_string_all(string, df):
    """Return the 5 rows whose ``cleaned_text`` best matches ``string``.

    See ``_top_matches`` for the scoring details and the returned columns.
    """
    return _top_matches(string, df, 'cleaned_text')


def compare_string_label(string, df):
    """Return the 5 rows whose ``cleaned_label`` best matches ``string``.

    See ``_top_matches`` for the scoring details and the returned columns.
    """
    return _top_matches(string, df, 'cleaned_label')
41
+
42
# Load the menu data and derive the normalised text columns that the fuzzy
# matchers score against.
df = pd.read_json('cleaned.json')

# "<label> : <ingredients>" for every row, kept both raw and normalised.
combined_text = df['label'] + ' : ' + df['Ingredients']
df['label+ingradient'] = combined_text
df['cleaned_text'] = combined_text.apply(process_string)

# Normalised label on its own, used by compare_string_label.
df['cleaned_label'] = df['label'].apply(process_string)

# Handle to the Pinecone index queried by the semantic-search page.
index = pinecone.Index('menuingradientsearch')
51
+
52
+
53
+ # Create a Streamlit app
54
def _read_query():
    """Render the shared prompt and return the normalised query text.

    The text is passed through ``process_string`` and stripped, so an input
    made only of whitespace comes back as ``""`` and callers can skip the
    search instead of scoring (or embedding) a blank string.
    """
    st.write("Enter a menu along with its ingredients:")
    st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
    raw_text = st.text_input("")
    return process_string(raw_text).strip()


def _render_fuzzy_page():
    """Render the fuzzy-match page: top 5 rows by ``fuzz.ratio`` score."""
    st.header("Matches using levenshtein_distance")
    query = _read_query()
    if not query:
        return

    st.write("Top 5 matches:")
    # Queries longer than four words are scored against the combined
    # label + ingredients text; shorter ones against the label only.
    if len(query.split()) > 4:
        top_matches = compare_string_all(query, df)
    else:
        top_matches = compare_string_label(query, df)
    st.dataframe(top_matches)


def _render_semantic_page():
    """Render the semantic-search page: top 10 matches from Pinecone."""
    st.header("Matches using embeddings (semantic search)")
    query = _read_query()
    if not query:
        return

    st.write("Top 10 matches using semantic search:")
    # Embed the query and ask the index for its nearest neighbours, with the
    # stored metadata (label / Ingredients) included in each match.
    xq = model.encode([query]).tolist()
    result = index.query(xq, top_k=10, includeMetadata=True)

    rows = [
        (match['metadata']['label'], match['metadata']['Ingredients'], match['score'])
        for match in result['matches']
    ]
    final_result = pd.DataFrame(rows, columns=['labels', 'ingradients', 'score'])
    st.dataframe(final_result)


def main():
    """Entry point of the Streamlit app: page chrome plus page dispatch."""
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("String Matching App :smiley:")

    # Sidebar radio buttons toggle between the two pages.
    pages = ["Fuzzy match", "Semantic search"]
    page = st.sidebar.radio("Select a page", pages)

    if page == pages[0]:
        _render_fuzzy_page()
    elif page == pages[1]:
        _render_semantic_page()


if __name__ == "__main__":
    main()
cleaned.json ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ddd04e95a5a8e294fbdd489e20c2c0a13daa6befae0cad18252936b95167149b
3
+ size 11689865
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ streamlit
2
+ pandas
3
+ numpy
4
+ fuzzywuzzy
5
+ sentence-transformers
6
+ pinecone-client