# Repository web-viewer header (not part of the program): app.py, commit 9a46289 "Update app.py" by amoldwalunj, 3.57 kB.
import json
import os

import numpy as np
import pandas as pd
import pinecone
import streamlit as st
from fuzzywuzzy import fuzz
from sentence_transformers import SentenceTransformer
# Connect to Pinecone and load the sentence-embedding model once, at import time.
#
# The API key is read from the environment instead of being committed to source
# control. NOTE(review): the key that used to be hard-coded on this line was
# published together with the file and should be rotated.
_pinecone_api_key = os.environ.get('PINECONE_API_KEY')
if not _pinecone_api_key:
    raise RuntimeError(
        'Set the PINECONE_API_KEY environment variable before starting the app.'
    )
pinecone.init(
    api_key=_pinecone_api_key,
    # Overridable for other deployments; the default is the value used so far.
    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp'),
)
# Embedding model used by main() to encode the search text; pinned to the CPU.
model = SentenceTransformer('all-mpnet-base-v2', device='cpu')
def process_string(s):
    """Normalize text for matching: lower-case it and spell out every '&' as 'and'."""
    lowered = s.lower()
    # Splitting on '&' and re-joining with 'and' substitutes every occurrence.
    return 'and'.join(lowered.split('&'))
def levenshtein_distance(s1, s2):
    """Score how alike two strings are using ``fuzz.ratio``.

    Note: despite the function's name, the value returned is fuzzywuzzy's
    similarity ratio rather than an edit distance; the callers in this file
    sort it in descending order to find the best matches.
    """
    similarity = fuzz.ratio(s1, s2)
    return similarity
def compare_string_all(string, df, top_k=5):
    """Return the rows of ``df`` whose ``cleaned_text`` best matches ``string``.

    Args:
        string: Query text; normalized with ``process_string`` before scoring.
        df: DataFrame with ``cleaned_text``, ``label`` and ``Ingredients``
            columns. It is left unmodified.
        top_k: Maximum number of rows to return (defaults to 5).

    Returns:
        A DataFrame with the columns ``label``, ``Ingredients`` and
        ``distance``, ordered by ``distance`` descending. Despite its name,
        ``distance`` holds the score from ``levenshtein_distance``, where a
        larger value ranks as a better match.
    """
    query = process_string(string)
    scores = df['cleaned_text'].apply(
        lambda text: levenshtein_distance(query, text.lower())
    )
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'distance' column as a side effect of searching.
    ranked = df.assign(distance=scores).sort_values('distance', ascending=False)
    return ranked.head(top_k)[['label', 'Ingredients', 'distance']]
def compare_string_label(string, df, top_k=5):
    """Return the rows of ``df`` whose ``cleaned_label`` best matches ``string``.

    Args:
        string: Query text; normalized with ``process_string`` before scoring.
        df: DataFrame with ``cleaned_label``, ``label`` and ``Ingredients``
            columns. It is left unmodified.
        top_k: Maximum number of rows to return (defaults to 5).

    Returns:
        A DataFrame with the columns ``label``, ``Ingredients`` and
        ``distance``, ordered by ``distance`` descending. Despite its name,
        ``distance`` holds the score from ``levenshtein_distance``, where a
        larger value ranks as a better match.
    """
    query = process_string(string)
    scores = df['cleaned_label'].apply(
        lambda label: levenshtein_distance(query, label.lower())
    )
    # assign() builds a new frame, so the caller's DataFrame does not gain a
    # 'distance' column as a side effect of searching.
    ranked = df.assign(distance=scores).sort_values('distance', ascending=False)
    return ranked.head(top_k)[['label', 'Ingredients', 'distance']]
# Load the reference table and derive the normalized text columns read by the
# compare_string_* helpers.
df = pd.read_json('cleaned.json')
# Combined "label : ingredients" text, then its normalized form.
df['label+ingradient'] = df['label'] + ' : ' + df['Ingredients']
df['cleaned_text'] = df['label+ingradient'].map(process_string)
# Normalized label on its own, used for label-only matching.
df['cleaned_label'] = df['label'].map(process_string)

# Handle to the Pinecone index that main() queries.
index = pinecone.Index('companiessearch')
# Create a Streamlit app
def main():
    """Render the Streamlit UI and run a semantic company-name search.

    Shows a single "Semantic search" page. The text the user enters is
    normalized with ``process_string``, encoded with the module-level
    ``model`` and used to query the module-level Pinecone ``index``. The
    returned matches are displayed as a table of company name, country and
    match score.
    """
    # Number of matches requested from the index. The heading shown to the
    # user is built from the same value so the two cannot drift apart.
    top_k = 10

    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
    st.title("Company name matching App :smiley:")

    # Sidebar page selector (only one page exists at the moment).
    pages = ["Semantic search"]
    page = st.sidebar.radio("Select a page", pages)
    if page != pages[0]:
        return

    st.header("Matches using embeddings (semantic search)")
    st.write("Enter a company name:")
    st.write("e.g. Airtel Africa Plc")
    input_string = process_string(st.text_input(""))

    # Nothing to do until the user presses the button.
    if not st.button("Enter"):
        return

    # Do not embed and query an empty string; ask for input instead.
    if not input_string.strip():
        st.warning("Please enter a company name to search.")
        return

    st.write(f"Top {top_k} matches using semantic search:")
    xq = model.encode([input_string]).tolist()
    result = index.query(xq, top_k=top_k, includeMetadata=True)

    # One (name, country, score) row per returned match.
    rows = [
        (match['metadata']['name'], match['metadata']['Country'], match['score'])
        for match in result['matches']
    ]
    final_result = pd.DataFrame(rows, columns=['Company_name', 'Country', 'score'])
    st.dataframe(final_result)
# Build the UI only when this file is executed as a script, not when it is
# imported as a module.
if __name__ == "__main__":
    main()