## Import
## ----------------
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util

## Init
## ----------------
# set config
st.set_page_config(layout="wide", page_title="EmojiFinder 🕵")

# `st.cache` is deprecated in newer Streamlit releases in favour of
# `st.cache_resource`; prefer the new API when this Streamlit version has it
# and fall back to the legacy decorator otherwise.
if hasattr(st, "cache_resource"):
    _cache_model = st.cache_resource
else:
    _cache_model = st.cache(allow_output_mutation=True)


# load the similarity model (cached so reruns do not reload it)
@_cache_model
def load_similarity_model(model_name='all-MiniLM-L6-v2'):
    """Return a ``SentenceTransformer`` built from ``model_name``.

    The result is cached by Streamlit, keyed on ``model_name``.
    """
    return SentenceTransformer(model_name)


# list of supported models
supported_models = [
    'all-MiniLM-L6-v2',
    'paraphrase-albert-small-v2',
    'paraphrase-MiniLM-L3-v2',
    'all-distilroberta-v1',
    'all-mpnet-base-v2',
]

# read the emoji df and extract the relevant columns
emoji_df = pd.read_csv('EmojiCharts_unicodeorg.csv')[['name', 'codepoints']]


# function to turn a code point token into the actual character
def encode_emoji(emoji):
    """Convert one code point token such as ``"U+1F600"`` to its character.

    The ``U+`` prefix is optional. Any number of hex digits that forms a
    valid Unicode code point is accepted. An empty token, a token that is
    not valid hexadecimal, or a value outside the Unicode range yields an
    empty string instead of raising.
    """
    hex_digits = emoji.replace("U+", "").strip()
    if not hex_digits:
        return ""
    try:
        return chr(int(hex_digits, 16))
    except (ValueError, OverflowError):
        # not hexadecimal, or not a valid Unicode code point
        return ""


# function to find the top similar sentences
def find_similar_sentences(query, target_sentences, n=5):
    """Return the indices of the ``n`` targets most similar to ``query``.

    Both the query and the targets are embedded with the module-level
    ``model`` and ranked by cosine similarity, best match first. The
    returned indices are positions within ``target_sentences``.
    """
    # materialise once so any iterable (list, pandas Series, ...) works and
    # the indices are plain positions
    sentences = list(target_sentences)
    if not sentences:
        return []

    # compute embeddings
    embeddings_query = model.encode([query], convert_to_tensor=True)
    embeddings_target = model.encode(sentences, convert_to_tensor=True)

    # cosine similarity of the single query against every target sentence
    cosine_scores = util.pytorch_cos_sim(embeddings_query, embeddings_target)

    # indices of the top n scores, highest first
    score_list = cosine_scores.tolist()[0]
    ranked = sorted(range(len(score_list)), key=score_list.__getitem__, reverse=True)
    return ranked[:n]


## App Development
## ----------------
# settings
selected_model_name = st.sidebar.selectbox('Similarity model', options=supported_models)
emoji_count = st.sidebar.slider('Emoji output count', min_value=1, max_value=10, value=5, step=1)

# title and headers
st.title("EmojiFinder 🕵")
st.markdown("Want to find the *most relevant* emoji for your text? **EmojiFinder** is here to help! 😎")
query_text = st.text_area("Enter your text here: ")
find_button = st.button("Help EmojiFinder!")

# load the model
model = load_similarity_model(selected_model_name)

# callback
if find_button:
    if not query_text.strip():
        # nothing to search for; avoid embedding an empty query
        st.warning("Please enter some text first.")
    else:
        # find the top N similar sentences
        with st.spinner("EmojiFinder is looking for clues to find the best emoji...."):
            top_indices = find_similar_sentences(
                query_text, emoji_df['name'].tolist(), emoji_count
            )
        # print the emojis
        for i in top_indices:
            row = emoji_df.iloc[i]
            # an emoji may consist of several space-separated code points
            glyphs = ' '.join(encode_emoji(x) for x in row['codepoints'].split())
            st.write(f'{row["name"]} - {glyphs}')