import streamlit as st
import pandas as pd
from fuzzywuzzy import fuzz
import pinecone
from sentence_transformers import SentenceTransformer

# Initialise Pinecone (pinecone-client v2 API) and the sentence-embedding model.
pinecone.init(api_key='5c5b5687-b73d-47e9-9cc8-e184ff72cc45', environment='us-central1-gcp')
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')


def process_string(s):
    """Lowercase the text and normalise ampersands."""
    return s.lower().replace('&', 'and')


def fuzzy_similarity(s1, s2):
    # fuzz.ratio returns a similarity score (0-100); higher means more similar.
    return fuzz.ratio(s1, s2)


def compare_string_all(string, df):
    """Match the query against the combined label + ingredients text."""
    string = process_string(string)
    df['score'] = df['cleaned_text'].apply(lambda x: fuzzy_similarity(string, x))
    top_5_df = df.sort_values('score', ascending=False).head(5)
    return top_5_df[['label', 'Ingredients', 'score']]


def compare_string_label(string, df):
    """Match the query against the label only (used for short queries)."""
    string = process_string(string)
    df['score'] = df['cleaned_label'].apply(lambda x: fuzzy_similarity(string, x))
    top_5_df = df.sort_values('score', ascending=False).head(5)
    return top_5_df[['label', 'Ingredients', 'score']]


# Load and pre-process the menu data.
df = pd.read_json('cleaned.json')
df['label+ingredient'] = df['label'] + ' : ' + df['Ingredients']
df['cleaned_text'] = df['label+ingredient'].apply(process_string)
df['cleaned_label'] = df['label'].apply(process_string)

# Connect to the existing Pinecone index.
index = pinecone.Index('menuingradientsearch')


# Create a Streamlit app
def main():
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("String Matching App :smiley:")

    # Define pages and add a sidebar radio button to toggle between them.
    pages = ["Fuzzy match", "Semantic search"]
    page = st.sidebar.radio("Select a page", pages)

    if page == pages[0]:
        st.header("Matches using Levenshtein distance (fuzzy matching)")
        st.write("Enter a menu item along with its ingredients:")
        st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
        input_string = st.text_input("Menu item", key="fuzzy_input")
        input_string = process_string(input_string)
        if input_string:
            st.write("Top 5 matches:")
            # Longer queries are matched against label + ingredients,
            # shorter ones against the label alone.
            if len(input_string.split()) > 4:
                top_matches = compare_string_all(input_string, df)
            else:
                top_matches = compare_string_label(input_string, df)
            st.dataframe(top_matches)

    elif page == pages[1]:
        st.header("Matches using embeddings (semantic search)")
        st.write("Enter a menu item along with its ingredients:")
        st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
        input_string = st.text_input("Menu item", key="semantic_input")
        input_string = process_string(input_string)
        if input_string:
            st.write("Top 10 matches using semantic search:")
            # Embed the query and retrieve the nearest neighbours from Pinecone.
            xq = model.encode(input_string).tolist()
            result = index.query(vector=xq, top_k=10, include_metadata=True)
            labels = []
            ingredients = []
            scores = []
            for match in result['matches']:
                labels.append(match['metadata']['label'])
                ingredients.append(match['metadata']['Ingredients'])
                scores.append(match['score'])
            final_result = pd.DataFrame(
                list(zip(labels, ingredients, scores)),
                columns=['label', 'ingredients', 'score'],
            )
            st.dataframe(final_result)


if __name__ == "__main__":
    main()
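

# ---------------------------------------------------------------------------
# The app above assumes the 'menuingradientsearch' index already exists and
# holds one 768-dimensional 'all-mpnet-base-v2' embedding per menu item, with
# 'label' and 'Ingredients' stored as metadata. The helper below is a minimal,
# hypothetical sketch of how such an index could be populated from the same
# dataframe using the pinecone-client v2 API; it is not called by the app, and
# the batch size and cosine metric are assumptions, not part of the original.
def build_index(data: pd.DataFrame, index_name: str = 'menuingradientsearch'):
    # Create the index if it does not exist yet (768 dims for all-mpnet-base-v2).
    if index_name not in pinecone.list_indexes():
        pinecone.create_index(index_name, dimension=768, metric='cosine')
    idx = pinecone.Index(index_name)

    # Embed the combined label + ingredient text and upsert in small batches.
    batch_size = 100
    for start in range(0, len(data), batch_size):
        batch = data.iloc[start:start + batch_size]
        vectors = model.encode(batch['cleaned_text'].tolist()).tolist()
        idx.upsert(vectors=[
            (str(i), vec, {'label': row['label'], 'Ingredients': row['Ingredients']})
            for (i, row), vec in zip(batch.iterrows(), vectors)
        ])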