File size: 2,971 Bytes
8d06109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4e79aa0
 
8d06109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
## Import
## ----------------
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util

## Init
## ----------------
# set config: wide layout and the browser-tab title
# (the title's trailing emoji is U+1F575; it had been stored as mis-decoded bytes)
st.set_page_config(layout="wide", page_title="EmojiFinder 🕵")

# load the sentence-similarity model (wrapped in st.cache for faster loading)
@st.cache(allow_output_mutation=True)
def load_similarity_model(model_name='all-MiniLM-L6-v2'):
    """Create the ``SentenceTransformer`` for the given model name.

    Args:
        model_name: Name handed straight to ``SentenceTransformer``.

    Returns:
        The constructed ``SentenceTransformer`` instance.
    """
    return SentenceTransformer(model_name)

# model names offered in the sidebar's "Similarity model" select box
supported_models = [
    'all-MiniLM-L6-v2',
    'paraphrase-albert-small-v2',
    'paraphrase-MiniLM-L3-v2',
    'all-distilroberta-v1',
    'all-mpnet-base-v2',
]

# load the emoji chart and keep only the 'name' and 'codepoints' columns
emoji_df = pd.read_csv('EmojiCharts_unicodeorg.csv').loc[:, ['name', 'codepoints']]

# function to turn a "U+XXXX" code point label into the character it names
def encode_emoji(emoji):
    """Convert one ``U+``-prefixed hexadecimal code point into its character.

    Args:
        emoji: Code point text such as ``"U+1F600"`` or ``"U+A9"``.

    Returns:
        The single-character string for that code point, or ``""`` when
        nothing remains after removing the ``U+`` prefix.

    Raises:
        ValueError: If the remaining text is not valid hexadecimal or lies
            outside the Unicode range.
    """
    hex_digits = emoji.replace("U+", "").strip()
    if not hex_digits:
        return ""
    # chr(int(..., 16)) accepts any number of hex digits, so short code
    # points (U+A9, U+23) and six-digit ones are converted instead of
    # collapsing to an empty string.
    return chr(int(hex_digits, 16))

# function to find the top similar sentences
def find_similar_sentences(query, target_sentences, n=5):
    """Return the positions of the ``n`` targets most similar to ``query``.

    The query and all targets are embedded with the module-level ``model``
    and compared by cosine similarity.

    Args:
        query: Text to match against the targets.
        target_sentences: Candidate sentences, passed to ``model.encode``.
        n: How many positions to return.

    Returns:
        A list of integer positions into ``target_sentences``, ordered from
        highest to lowest similarity.
    """
    # embed the single query and every candidate
    query_vec = model.encode([query], convert_to_tensor=True)
    target_vecs = model.encode(target_sentences, convert_to_tensor=True)

    # first (only) row of the similarity matrix: query vs. each target
    similarity_row = util.pytorch_cos_sim(query_vec, target_vecs).tolist()[0]

    # rank (position, score) pairs by score, best first, and keep n of them
    ranked = sorted(enumerate(similarity_row), key=lambda pair: pair[1], reverse=True)
    return [position for position, _ in ranked[:n]]

## App Development
## ----------------

# settings: sidebar controls for the model and the number of results
selected_model_name = st.sidebar.selectbox('Similarity model', options=supported_models)
emoji_count = st.sidebar.slider('Emoji output count', min_value=1, max_value=10, value=5, step=1)

# title and headers
# (the emoji in these strings had been stored as mis-decoded UTF-8 bytes)
st.title("EmojiFinder 🕵")
st.markdown("Want to find the *most relevant* emoji for your text? **EmojiFinder** is here to help! 😎")
query_text = st.text_area("Enter your text here: ", "I love walking on the beach")
find_button = st.button("EmojiFinder help!")

# load the model chosen in the sidebar; find_similar_sentences reads it as a global
model = load_similarity_model(selected_model_name)

# callback: only open the spinner when there is actual work to do
if find_button:
    with st.spinner("EmojiFinder is looking for clues to find the best emoji...."):
        # find the top N emoji names most similar to the query;
        # pass a plain list of names rather than a pandas Series
        top_indices = find_similar_sentences(query_text, emoji_df['name'].tolist(), emoji_count)
        # print the emojis
        for i in top_indices:
            emoji_row = emoji_df.iloc[i]
            # an emoji may be made of several space-separated code points
            glyphs = ' '.join(encode_emoji(x) for x in emoji_row['codepoints'].split(' '))
            st.write(f'{emoji_row["name"]} - {glyphs}')