File size: 2,183 Bytes
f11912b
f68327f
 
 
f11912b
4f7df78
f68327f
aae2963
 
 
 
 
 
 
 
 
f68327f
 
 
 
 
 
3ddbd4c
f68327f
3ddbd4c
f68327f
aae2963
f68327f
 
35c4462
aae2963
35c4462
aae2963
35c4462
6ed36a2
35c4462
65131af
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
import streamlit as st
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer

def app():
    """Render the Text Similarity page.

    Embeds two user-supplied texts with the "all-MiniLM-L6-v2" sentence
    transformer and displays their cosine similarity as a percentage.
    """
    st.title("Text Similarity")
    with st.expander("ℹ️ - About this app", expanded=True):

        st.write(
            """     
            Information cartography - Get your word/phrase/sentence/paragraph embedded and visualized.
            The (English) sentence-transformers model "all-MiniLM-L6-v2" maps sentences & paragraphs to a 384 dimensional dense vector space This is normally used for tasks like clustering or semantic search, but in this case, we use it to calculate the (cosine) similarity. The sentence transformer is context sensitive and works best with whole sentences, to account for that we extend your text with "The book is about <text>" if its less than 15 characters.
            
            Simply put in your text and press COMPARE, the higher the similarity the closer the text in the embedding space (max 1).
            """)

    # NOTE(review): the model is re-instantiated on every Streamlit rerun;
    # consider caching it (e.g. st.cache_resource) at module level.
    model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

    with st.container():
        col1, col2 = st.columns(2)
        with col1:
            word_to_embed1 = st.text_input("Text 1", value="The most vulnerable countries are seeking ‘loss and damage’ compensation from the biggest polluters.",)
        with col2:
            word_to_embed2 = st.text_input("Text 2", value="COP27 opens amid compounding crises of war, warming and economic instability.",)

    if st.button("Compare"):
        with st.spinner("Embedding and comparing your inputs"):

            documents = [word_to_embed1, word_to_embed2]
            # The sentence-level model works best on whole sentences, so very
            # short fragments (< 15 chars) are prefixed to form one; longer
            # texts pass through unchanged. (Previously this was a filter,
            # which DROPPED any text >= 15 chars and crashed on the defaults.)
            documents_embed = [
                "The book is about " + text if len(text) < 15 else text
                for text in documents
            ]
            # Encode both texts into 384-d vectors.
            document_embeddings = model.encode(documents_embed, show_progress_bar=False)
            # Pairwise cosine similarity; [0][1] is the cross-text score.
            similarity_matrix = cosine_similarity(document_embeddings)

            st.write("Text similarity:", round(similarity_matrix[0][1] * 100, 2), "%")