Spaces:

Jerry0203
/

sentence_embedding

Runtime error

File size: 7,601 Bytes

import gradio as gr
from tqdm.autonotebook import tqdm
import ast
import nltk
from sentence_transformers import SentenceTransformer, util
import numpy as np
import os
import matplotlib.pyplot as plt
from PIL import Image
import glob
import torch
import pickle
import zipfile
from scipy.sparse.csgraph import connected_components
from scipy.special import softmax
import logging
import re

logger = logging.getLogger(__name__)

def degree_centrality_scores(
    similarity_matrix,
    threshold=None,
    increase_power=True,
):
    if not (
        threshold is None
        or isinstance(threshold, float)
        and 0 <= threshold < 1
    ):
        raise ValueError(
            '\'threshold\' should be a floating-point number '
            'from the interval [0, 1) or None',
        )

    if threshold is None:
        markov_matrix = create_markov_matrix(similarity_matrix)

    else:
        markov_matrix = create_markov_matrix_discrete(
            similarity_matrix,
            threshold,
        )

    scores = stationary_distribution(
        markov_matrix,
        increase_power=increase_power,
        normalized=False,
    )

    return scores


def _power_method(transition_matrix, increase_power=True, max_iter=10000):
    eigenvector = np.ones(len(transition_matrix))

    if len(eigenvector) == 1:
        return eigenvector

    transition = transition_matrix.transpose()

    for _ in range(max_iter):
        eigenvector_next = np.dot(transition, eigenvector)

        if np.allclose(eigenvector_next, eigenvector):
            return eigenvector_next

        eigenvector = eigenvector_next

        if increase_power:
            transition = np.dot(transition, transition)

    logger.warning("Maximum number of iterations for power method exceeded without convergence!")
    return eigenvector_next


def connected_nodes(matrix):
    _, labels = connected_components(matrix)

    groups = []

    for tag in np.unique(labels):
        group = np.where(labels == tag)[0]
        groups.append(group)

    return groups


def create_markov_matrix(weights_matrix):
    n_1, n_2 = weights_matrix.shape
    if n_1 != n_2:
        raise ValueError('\'weights_matrix\' should be square')

    row_sum = weights_matrix.sum(axis=1, keepdims=True)

    # normalize probability distribution differently if we have negative transition values
    if np.min(weights_matrix) <= 0:
        return softmax(weights_matrix, axis=1)

    return weights_matrix / row_sum


def create_markov_matrix_discrete(weights_matrix, threshold):
    discrete_weights_matrix = np.zeros(weights_matrix.shape)
    ixs = np.where(weights_matrix >= threshold)
    discrete_weights_matrix[ixs] = 1

    return create_markov_matrix(discrete_weights_matrix)


def stationary_distribution(
    transition_matrix,
    increase_power=True,
    normalized=True,
):
    n_1, n_2 = transition_matrix.shape
    if n_1 != n_2:
        raise ValueError('\'transition_matrix\' should be square')

    distribution = np.zeros(n_1)

    grouped_indices = connected_nodes(transition_matrix)

    for group in grouped_indices:
        t_matrix = transition_matrix[np.ix_(group, group)]
        eigenvector = _power_method(t_matrix, increase_power=increase_power)
        distribution[group] = eigenvector

    if normalized:
        distribution /= n_1

    return distribution

def cut_sent(para):
    para = re.sub('([。！？\?])([^”’])', r"\1\n\2", para)
    para = re.sub('(\.{6})([^”’])', r"\1\n\2", para)
    para = re.sub('(\…{2})([^”’])', r"\1\n\2", para)
    para = re.sub('([。！？\?][”’])([^，。！？\?])', r'\1\n\2', para)
    para = para.rstrip()
    return para.split("\n")


def embed(document):
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    sentences = cut_sent(document)
    embeddings = model.encode(sentences, convert_to_tensor=True)
    #Compute the pair-wise cosine similarities
    cos_scores = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()
 
    #Compute the centrality for each sentence
    centrality_scores = degree_centrality_scores(cos_scores, threshold=None)
 
    #We argsort so that the first element is the sentence with the highest score
    most_central_sentence_indices = np.argsort(-centrality_scores)
    response = sentences[most_central_sentence_indices[0]]
    return response



def search(query):
    model = SentenceTransformer('clip-ViT-B-32-multilingual-v1')
    img_folder = 'photos/'
    if not os.path.exists(img_folder) or len(os.listdir(img_folder)) == 0:
        os.makedirs(img_folder, exist_ok=True)
    
        photo_filename = 'unsplash-25k-photos.zip'
        if not os.path.exists(photo_filename):   #Download dataset if does not exist
            util.http_get('http://sbert.net/datasets/'+photo_filename, photo_filename)
        
        #Extract all images
        with zipfile.ZipFile(photo_filename, 'r') as zf:
            for member in tqdm(zf.infolist(), desc='Extracting'):
                zf.extract(member, img_folder)

    emb_filename = 'unsplash-25k-photos-embeddings.pkl'
    if not os.path.exists(emb_filename):   #Download dataset if does not exist
        util.http_get('http://sbert.net/datasets/'+emb_filename, emb_filename)

    with open(emb_filename, 'rb') as fIn:
        img_names, img_emb = pickle.load(fIn)  

    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)
    
    hits = util.semantic_search(query_emb, img_emb, top_k=1)[0]
    for hit in hits:
        return plt.imread(os.path.join(img_folder, img_names[hit['corpus_id']]))

def sentence_embedding(sentences):
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embeddings = model.encode(sentences)
    return embeddings

def sentence_sim(sentence1, sentence2):
    model = SentenceTransformer('sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2')
    embedding1 = model.encode(sentence1)
    embedding2 = model.encode(sentence2)
    cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
    return cos_scores[0][0]
    
with gr.Blocks() as demo:
    with gr.Tab("Text Summarization"):
        gr.Markdown("Give a long document, find the sentence that give a good and short summary of the content.")
        text_input = gr.Textbox(label="document")
        text_output = gr.Textbox(label="summatization")
        text_button = gr.Button("Summarize")
    with gr.Tab("Image Search"):
        gr.Markdown("Image search given a user query.")
        with gr.Row():
            image_input = gr.Textbox(label="query")
            image_output = gr.Image(label="image")
        image_button = gr.Button("Search")
    with gr.Tab("Sentence Embedding"):
        gr.Markdown("Embed the given sentence.")
        embed_input = gr.Textbox(label="sentence")
        embed_output = gr.Textbox(label="embedding")
        embed_button = gr.Button("Embed")
    with gr.Tab("Sentence Similarity"):
        gr.Markdown("Calculate the similarity of two sentences.")
        sim_input1 = gr.Textbox(label="sentence_1")
        sim_input2 = gr.Textbox(label="sentence_2")
        sim_output = gr.Textbox(label="similarity")
        sim_button = gr.Button("Calculate")

    text_button.click(embed, inputs=text_input, outputs=text_output)
    image_button.click(search, inputs=image_input, outputs=image_output)
    embed_button.click(sentence_embedding, inputs=embed_input, outputs=embed_output)
    sim_button.click(sentence_sim, inputs=[sim_input1, sim_input2], outputs=sim_output)
demo.launch()