company_name_matches_using_embeddings

Runtime error

App Files Files Community

company_name_matches_using_embeddings / app.py

amoldwalunj

Update app.py

9a46289 over 1 year ago

raw

history blame

No virus

3.57 kB

	import json
	import os

	import numpy as np
	import pandas as pd
	import pinecone
	import streamlit as st
	from fuzzywuzzy import fuzz
	from sentence_transformers import SentenceTransformer

	# The Pinecone API key is read from the environment so that no secret is
	# committed to the repository. Any key that was previously hard-coded in
	# this file must be treated as leaked and rotated in the Pinecone console.
	_PINECONE_API_KEY = os.environ.get('PINECONE_API_KEY')
	if not _PINECONE_API_KEY:
	    raise RuntimeError(
	        "PINECONE_API_KEY is not set; configure it as an environment "
	        "variable (or Space secret) before starting the app."
	    )

	# The environment keeps its previous value by default but can be overridden
	# without touching the code.
	pinecone.init(
	    api_key=_PINECONE_API_KEY,
	    environment=os.environ.get('PINECONE_ENVIRONMENT', 'us-west4-gcp'),
	)

	# Sentence-embedding model used to encode the user's query; pinned to CPU.
	model = SentenceTransformer('all-mpnet-base-v2', device='cpu')

	def process_string(s):
	    """Normalise *s* for matching: lower-case it and spell out ampersands.

	    Every ``&`` character is replaced by the literal text ``and`` (no
	    spaces are inserted around it).
	    """
	    lowered = s.lower()
	    normalised = lowered.replace('&', 'and')
	    return normalised

	def levenshtein_distance(s1, s2):
	    """Score how closely *s1* and *s2* match using ``fuzz.ratio``.

	    NOTE: despite the name, the value returned is whatever ``fuzz.ratio``
	    produces, not a raw edit distance; the helpers in this file sort it
	    in descending order, i.e. they treat a larger value as a better match.
	    """
	    score = fuzz.ratio(s1, s2)
	    return score

	def _top_matches(string, df, column, limit=5):
	    """Return the *limit* rows of *df* whose *column* best matches *string*.

	    The query is normalised with ``process_string`` and every value of
	    *column* is lower-cased before being scored with
	    ``levenshtein_distance``. Scores are attached to a copy of the frame
	    (``DataFrame.assign``), so the caller's DataFrame is never modified.

	    Returns a DataFrame with the columns ``label``, ``Ingredients`` and
	    ``distance``, ordered from the highest score to the lowest.
	    """
	    query = process_string(string)
	    scores = df[column].apply(
	        lambda text: levenshtein_distance(query, text.lower())
	    )
	    ranked = df.assign(distance=scores).sort_values('distance', ascending=False)
	    return ranked.head(limit)[['label', 'Ingredients', 'distance']]


	def compare_string_all(string, df):
	    """Top 5 rows of *df* ranked by similarity of ``cleaned_text`` to *string*.

	    See ``_top_matches`` for the shape of the result; *df* is left untouched.
	    """
	    return _top_matches(string, df, 'cleaned_text')


	def compare_string_label(string, df):
	    """Top 5 rows of *df* ranked by similarity of ``cleaned_label`` to *string*.

	    See ``_top_matches`` for the shape of the result; *df* is left untouched.
	    """
	    return _top_matches(string, df, 'cleaned_label')

	# Reference table for the fuzzy-matching helpers (compare_string_all /
	# compare_string_label). The active page rendered by main() does not read it.
	df = pd.read_json('cleaned.json')

	# "<label> : <Ingredients>" combined text, stored raw and in normalised form.
	combined_text = df['label'] + ' : ' + df['Ingredients']
	df['label+ingradient'] = combined_text
	df['cleaned_text'] = combined_text.apply(process_string)

	# Normalised label on its own, used when matching against labels only.
	df['cleaned_label'] = df['label'].apply(process_string)

	# Handle to the Pinecone index queried by the semantic-search page.
	index = pinecone.Index('companiessearch')


	# Create a Streamlit app
	def main():
	    """Render the Streamlit UI and run a semantic company-name search.

	    The text typed by the user is normalised with ``process_string``,
	    embedded with the module-level ``model`` and sent to the module-level
	    Pinecone ``index``. The matches are shown as a table with the columns
	    ``Company_name``, ``Country`` and ``score``.

	    Nothing is queried until the "Enter" button is pressed, and an empty
	    input produces a warning instead of a search.
	    """
	    st.set_page_config(page_title="String Matching App", page_icon=":smiley:", layout="wide")
	    st.title("Company name matching App :smiley:")

	    # Define pages
	    pages = ["Semantic search"]

	    # Add radio buttons to toggle between pages
	    page = st.sidebar.radio("Select a page", pages)

	    # NOTE: a fuzzy-matching page built on compare_string_all /
	    # compare_string_label used to be wired in here; only the semantic
	    # search page is currently exposed.
	    if page != pages[0]:
	        return

	    st.header("Matches using embeddings (semantic search)")
	    st.write("Enter a company name:")
	    st.write("e.g. Airtel Africa Plc")
	    input_string = process_string(st.text_input(""))

	    if not st.button("Enter"):
	        return

	    # Do not spend an embedding + index round-trip on an empty query.
	    if not input_string.strip():
	        st.warning("Please enter a company name to search for.")
	        return

	    # Single source of truth for the result count, so the heading always
	    # agrees with the number of matches requested from the index.
	    top_k = 10
	    st.write(f"Top {top_k} matches using semantic search:")

	    xq = model.encode([input_string]).tolist()
	    result = index.query(xq, top_k=top_k, includeMetadata=True)

	    rows = [
	        (match['metadata']['name'], match['metadata']['Country'], match['score'])
	        for match in result['matches']
	    ]
	    final_result = pd.DataFrame(rows, columns=['Company_name', 'Country', 'score'])

	    st.dataframe(final_result)

	# Script entry point: build the UI only when this file is executed as the
	# main module, not when it is imported.
	if __name__ == "__main__":
	    main()