import streamlit as st
import pandas as pd
from fuzzywuzzy import fuzz
import pinecone
from sentence_transformers import SentenceTransformer

# Connect to Pinecone and load the sentence-embedding model.
# (The API key is hard-coded as in the original; in practice it should live in
# st.secrets or an environment variable.)
pinecone.init(api_key='72677043-918a-4a15-9077-9c5b3cc40df9', environment='us-west4-gcp')
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')


def process_string(s):
    """Lower-case the string and normalise '&' to 'and'."""
    return s.lower().replace('&', 'and')


def levenshtein_distance(s1, s2):
    """Fuzzy similarity ratio (0-100); higher means more similar."""
    return fuzz.ratio(s1, s2)


def compare_string_all(string, df):
    """Match the query against label + ingredients and return the top 5 rows."""
    string = process_string(string)
    df['distance'] = df['cleaned_text'].apply(lambda x: levenshtein_distance(string, x.lower()))
    top_5_df = df.sort_values('distance', ascending=False).head(5)
    return top_5_df[['label', 'Ingredients', 'distance']]


def compare_string_label(string, df):
    """Match the query against the label only and return the top 5 rows."""
    string = process_string(string)
    df['distance'] = df['cleaned_label'].apply(lambda x: levenshtein_distance(string, x.lower()))
    top_5_df = df.sort_values('distance', ascending=False).head(5)
    return top_5_df[['label', 'Ingredients', 'distance']]


# Load and pre-process the reference data.
df = pd.read_json('cleaned.json')
df['label+ingredient'] = df['label'] + ' : ' + df['Ingredients']
df['cleaned_text'] = df['label+ingredient'].apply(process_string)
df['cleaned_label'] = df['label'].apply(process_string)

# Pinecone index holding the pre-computed company embeddings.
index = pinecone.Index('companiessearch')


# Create a Streamlit app
def main():
    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("Company name matching App :smiley:")

    # Define pages
    pages = ["Semantic search"]

    # Add radio buttons to toggle between pages
    page = st.sidebar.radio("Select a page", pages)

    # if page == pages[0]:
    #     st.header("Matches using levenshtein_distance")
    #     st.write("Enter a menu along with its ingredients:")
    #     st.write("e.g. Pita & HUMMUS Garlic Hummus, crispy seasoned pita")
    #     input_string = st.text_input("")
    #     input_string = process_string(input_string)
    #     if input_string:
    #         st.write("Top 5 matches:")
    #         if len(input_string.split()) > 4:
    #             top_matches = compare_string_all(input_string, df)
    #         else:
    #             top_matches = compare_string_label(input_string, df)
    #         st.dataframe(top_matches)

    if page == pages[0]:
        st.header("Matches using embeddings (semantic search)")
        st.write("Enter a company name:")
        st.write("e.g. Airtel Africa Plc")
        input_string = st.text_input("")
        input_string = process_string(input_string)

        if st.button("Enter"):
            st.write("Top 5 matches using semantic search:")
            # Embed the query and search the Pinecone index.
            xq = model.encode(input_string).tolist()
            result = index.query(xq, top_k=5, include_metadata=True)

            names, countries, scores = [], [], []
            for match in result['matches']:
                names.append(match['metadata']['name'])
                countries.append(match['metadata']['Country'])
                scores.append(match['score'])

            final_result = pd.DataFrame(
                list(zip(names, countries, scores)),
                columns=['Company_name', 'Country', 'score'],
            )
            st.dataframe(final_result)


if __name__ == "__main__":
    main()
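
# --- Reference only: how the 'companiessearch' index is assumed to be built ---
# The query above reads 'name' and 'Country' from each match's metadata, so the
# index must have been populated with those fields. The commented sketch below
# is an assumption about that one-off indexing step, not part of this app; the
# 'companies.json' filename, column names, and id scheme are hypothetical.
#
# def build_index_sketch():
#     companies = pd.read_json('companies.json')  # hypothetical source file
#     vectors = []
#     for i, row in companies.iterrows():
#         embedding = model.encode(process_string(row['name'])).tolist()
#         metadata = {'name': row['name'], 'Country': row['Country']}
#         vectors.append((str(i), embedding, metadata))  # (id, values, metadata)
#     # Upsert in batches of 100 to stay within request-size limits.
#     for start in range(0, len(vectors), 100):
#         index.upsert(vectors=vectors[start:start + 100])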