Spaces:

Jerry0203
/

sentence_embedding

Runtime error

App Files Files Community

sentence_embedding / app.py

Jerry0203

Update app.py

7bd2349 about 2 years ago

raw

history blame contribute delete

7.6 kB

	import gradio as gr
	from tqdm.autonotebook import tqdm
	import ast
	import nltk
	from sentence_transformers import SentenceTransformer, util
	import numpy as np
	import os
	import matplotlib.pyplot as plt
	from PIL import Image
	import glob
	import torch
	import pickle
	import zipfile
	from scipy.sparse.csgraph import connected_components
	from scipy.special import softmax
	import logging
	import re

	# Logger for this module, named after the module (__name__).
	logger = logging.getLogger(__name__)

	def degree_centrality_scores(
	    similarity_matrix,
	    threshold=None,
	    increase_power=True,
	):
	    """Compute centrality scores for the nodes of a similarity graph.

	    The similarity matrix is turned into a Markov transition matrix
	    (from the raw weights when ``threshold`` is None, from thresholded 0/1
	    weights otherwise) and handed to ``stationary_distribution`` with
	    ``normalized=False``.

	    Args:
	        similarity_matrix: Square matrix of pairwise similarities.
	        threshold: None, or a ``float`` in the interval [0, 1). Any other
	            type (including ``int``) is rejected.
	        increase_power: Forwarded unchanged to ``stationary_distribution``.

	    Returns:
	        The array returned by ``stationary_distribution``.

	    Raises:
	        ValueError: If ``threshold`` is neither None nor a float in [0, 1).
	    """
	    if threshold is not None:
	        # Only genuine floats inside [0, 1) are accepted as a threshold.
	        is_valid = isinstance(threshold, float) and 0 <= threshold < 1
	        if not is_valid:
	            raise ValueError(
	                "'threshold' should be a floating-point number "
	                "from the interval [0, 1) or None",
	            )
	        transition = create_markov_matrix_discrete(
	            similarity_matrix,
	            threshold,
	        )
	    else:
	        transition = create_markov_matrix(similarity_matrix)

	    return stationary_distribution(
	        transition,
	        increase_power=increase_power,
	        normalized=False,
	    )


	def _power_method(transition_matrix, increase_power=True, max_iter=10000):
	eigenvector = np.ones(len(transition_matrix))

	if len(eigenvector) == 1:
	return eigenvector

	transition = transition_matrix.transpose()

	for _ in range(max_iter):
	eigenvector_next = np.dot(transition, eigenvector)

	if np.allclose(eigenvector_next, eigenvector):
	return eigenvector_next

	eigenvector = eigenvector_next

	if increase_power:
	transition = np.dot(transition, transition)

	logger.warning("Maximum number of iterations for power method exceeded without convergence!")
	return eigenvector_next


	def connected_nodes(matrix):
	    """Group node indices by connected component.

	    Args:
	        matrix: Adjacency/weight matrix accepted by
	            ``scipy.sparse.csgraph.connected_components`` (called with its
	            default settings).

	    Returns:
	        A list with one integer index array per component, ordered by
	        ascending component label.
	    """
	    _, component_of = connected_components(matrix)
	    return [
	        np.flatnonzero(component_of == component_id)
	        for component_id in np.unique(component_of)
	    ]


	def create_markov_matrix(weights_matrix):
	    """Turn a square weight matrix into a row-stochastic matrix.

	    Args:
	        weights_matrix: 2-D array of edge weights.

	    Returns:
	        An array of the same shape whose rows sum to one: a row-wise softmax
	        when the smallest weight is zero or negative, otherwise each row
	        divided by its sum.

	    Raises:
	        ValueError: If ``weights_matrix`` is not square.
	    """
	    n_rows, n_cols = weights_matrix.shape
	    if n_rows != n_cols:
	        raise ValueError('\'weights_matrix\' should be square')

	    # Plain row normalisation is only used when every weight is strictly
	    # positive; any zero or negative entry switches to softmax.
	    has_non_positive = np.min(weights_matrix) <= 0
	    if has_non_positive:
	        return softmax(weights_matrix, axis=1)

	    return weights_matrix / weights_matrix.sum(axis=1, keepdims=True)


	def create_markov_matrix_discrete(weights_matrix, threshold):
	    """Binarise the weights at ``threshold`` and build a Markov matrix.

	    Args:
	        weights_matrix: 2-D array of edge weights.
	        threshold: Entries greater than or equal to this value become 1,
	            all others 0.

	    Returns:
	        The result of ``create_markov_matrix`` applied to the 0/1 matrix.
	    """
	    adjacency = np.zeros(weights_matrix.shape)
	    adjacency[weights_matrix >= threshold] = 1

	    # NOTE(review): whenever at least one entry falls below the threshold the
	    # 0/1 matrix contains zeros, so create_markov_matrix takes its softmax
	    # branch rather than plain row normalisation - confirm this is intended.
	    return create_markov_matrix(adjacency)


	def stationary_distribution(
	    transition_matrix,
	    increase_power=True,
	    normalized=True,
	):
	    """Run the power method separately on every connected component.

	    Args:
	        transition_matrix: Square 2-D array of transition probabilities.
	        increase_power: Forwarded to ``_power_method``.
	        normalized: When true, the assembled vector is divided by the number
	            of rows of ``transition_matrix``.

	    Returns:
	        A 1-D float array with one value per row of ``transition_matrix``.

	    Raises:
	        ValueError: If ``transition_matrix`` is not square.
	    """
	    size, width = transition_matrix.shape
	    if size != width:
	        raise ValueError('\'transition_matrix\' should be square')

	    result = np.zeros(size)

	    for members in connected_nodes(transition_matrix):
	        # Restrict the chain to this component and solve it in isolation.
	        sub_matrix = transition_matrix[np.ix_(members, members)]
	        result[members] = _power_method(sub_matrix, increase_power=increase_power)

	    return result / size if normalized else result

	def cut_sent(para):
	    """Split a paragraph into sentences at CJK-style sentence terminators.

	    A line break is inserted after each terminator and the text is then split
	    on line breaks, so line breaks already present in ``para`` also act as
	    separators. Closing quotes (”, ’) directly after a terminator stay
	    attached to the sentence they close.

	    Args:
	        para: Text to split.

	    Returns:
	        A list of sentence strings; ``[""]`` for an empty input.
	    """
	    # Patterns are raw strings: the previous non-raw literals relied on invalid
	    # escape sequences, which emit SyntaxWarning on recent Python versions.
	    # Single-character terminators not followed by a closing quote.
	    para = re.sub(r'([。！？?])([^”’])', r"\1\n\2", para)
	    # Six-dot ASCII ellipsis.
	    para = re.sub(r'(\.{6})([^”’])', r"\1\n\2", para)
	    # Double "…" ellipsis.
	    para = re.sub(r'(…{2})([^”’])', r"\1\n\2", para)
	    # Terminator followed by a closing quote: break after the quote, unless
	    # more punctuation follows.
	    para = re.sub(r'([。！？?][”’])([^，。！？?])', r'\1\n\2', para)
	    # Drop trailing whitespace so no empty trailing sentence is produced.
	    para = para.rstrip()
	    return para.split("\n")


	# Names of the pretrained SentenceTransformer checkpoints used by the handlers below.
	_TEXT_MODEL_NAME = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
	_IMAGE_MODEL_NAME = 'clip-ViT-B-32-multilingual-v1'

	# Process-wide caches so that models and the image index are loaded once
	# instead of on every request.
	_MODEL_CACHE = {}
	_IMAGE_INDEX_CACHE = {}


	def _get_model(model_name):
	    """Return the SentenceTransformer for ``model_name``, loading it on first use."""
	    model = _MODEL_CACHE.get(model_name)
	    if model is None:
	        # No lock is taken: concurrent first calls may each load the model
	        # and the last one to finish is the one kept in the cache.
	        model = SentenceTransformer(model_name)
	        _MODEL_CACHE[model_name] = model
	    return model


	def _ensure_photos(img_folder):
	    """Download and unpack the photo archive unless ``img_folder`` already has content."""
	    if os.path.exists(img_folder) and os.listdir(img_folder):
	        return

	    os.makedirs(img_folder, exist_ok=True)

	    photo_filename = 'unsplash-25k-photos.zip'
	    if not os.path.exists(photo_filename):  # Download dataset if does not exist
	        util.http_get('http://sbert.net/datasets/' + photo_filename, photo_filename)

	    # Extract all images
	    with zipfile.ZipFile(photo_filename, 'r') as zf:
	        for member in tqdm(zf.infolist(), desc='Extracting'):
	            zf.extract(member, img_folder)


	def _load_image_index():
	    """Return ``(img_names, img_emb)`` from the embedding pickle, reading it once."""
	    emb_filename = 'unsplash-25k-photos-embeddings.pkl'
	    index = _IMAGE_INDEX_CACHE.get(emb_filename)
	    if index is None:
	        if not os.path.exists(emb_filename):  # Download dataset if does not exist
	            util.http_get('http://sbert.net/datasets/' + emb_filename, emb_filename)

	        # NOTE(security): pickle.load can execute arbitrary code and this file
	        # is fetched over plain HTTP - only acceptable if the source is trusted.
	        with open(emb_filename, 'rb') as fIn:
	            img_names, img_emb = pickle.load(fIn)
	        index = (img_names, img_emb)
	        _IMAGE_INDEX_CACHE[emb_filename] = index
	    return index


	def embed(document):
	    """Return the sentence of ``document`` with the highest centrality score.

	    The document is split with ``cut_sent``, each non-blank sentence is
	    embedded, pairwise cosine similarities are computed and passed to
	    ``degree_centrality_scores``.

	    Args:
	        document: Text to summarise; None or blank text is allowed.

	    Returns:
	        The selected sentence, or ``""`` when ``document`` is None or blank.
	    """
	    if not document or not document.strip():
	        return ""

	    # Blank fragments (e.g. from consecutive line breaks) carry no content and
	    # would otherwise be embedded and ranked like real sentences.
	    sentences = [sentence for sentence in cut_sent(document) if sentence.strip()]

	    model = _get_model(_TEXT_MODEL_NAME)
	    embeddings = model.encode(sentences, convert_to_tensor=True)

	    # Compute the pair-wise cosine similarities
	    cos_scores = util.pytorch_cos_sim(embeddings, embeddings).cpu().numpy()

	    # Compute the centrality for each sentence
	    centrality_scores = degree_centrality_scores(cos_scores, threshold=None)

	    # Only the top-ranked sentence is needed, so argmax replaces a full sort.
	    return sentences[int(np.argmax(centrality_scores))]


	def search(query):
	    """Return the photo that best matches a text query.

	    On first use the photo archive and the precomputed image embeddings are
	    downloaded if they are not present on disk.

	    Args:
	        query: Free-text search query; None or blank text is allowed.

	    Returns:
	        The image as returned by ``plt.imread``, or None when the query is
	        None/blank or the search yields no hit.
	    """
	    if not query or not query.strip():
	        return None

	    img_folder = 'photos/'
	    _ensure_photos(img_folder)
	    img_names, img_emb = _load_image_index()

	    model = _get_model(_IMAGE_MODEL_NAME)
	    query_emb = model.encode([query], convert_to_tensor=True, show_progress_bar=False)

	    hits = util.semantic_search(query_emb, img_emb, top_k=1)[0]
	    if not hits:
	        return None
	    return plt.imread(os.path.join(img_folder, img_names[hits[0]['corpus_id']]))


	def sentence_embedding(sentences):
	    """Return the embedding that the text model produces for ``sentences``."""
	    model = _get_model(_TEXT_MODEL_NAME)
	    embeddings = model.encode(sentences)
	    return embeddings


	def sentence_sim(sentence1, sentence2):
	    """Return the cosine similarity between the embeddings of two sentences."""
	    model = _get_model(_TEXT_MODEL_NAME)
	    embedding1 = model.encode(sentence1)
	    embedding2 = model.encode(sentence2)
	    cos_scores = util.pytorch_cos_sim(embedding1, embedding2).cpu().numpy()
	    # The score matrix has a single entry for one sentence pair.
	    return cos_scores[0][0]

	# Gradio UI: one tab per feature, each with its inputs, an output and a button.
	with gr.Blocks() as demo:
	    with gr.Tab("Text Summarization"):
	        gr.Markdown("Given a long document, find the sentence that gives a good and short summary of the content.")
	        text_input = gr.Textbox(label="document")
	        text_output = gr.Textbox(label="summarization")
	        text_button = gr.Button("Summarize")
	    with gr.Tab("Image Search"):
	        gr.Markdown("Image search given a user query.")
	        with gr.Row():
	            image_input = gr.Textbox(label="query")
	            image_output = gr.Image(label="image")
	        image_button = gr.Button("Search")
	    with gr.Tab("Sentence Embedding"):
	        gr.Markdown("Embed the given sentence.")
	        embed_input = gr.Textbox(label="sentence")
	        embed_output = gr.Textbox(label="embedding")
	        embed_button = gr.Button("Embed")
	    with gr.Tab("Sentence Similarity"):
	        gr.Markdown("Calculate the similarity of two sentences.")
	        sim_input1 = gr.Textbox(label="sentence_1")
	        sim_input2 = gr.Textbox(label="sentence_2")
	        sim_output = gr.Textbox(label="similarity")
	        sim_button = gr.Button("Calculate")

	    # Wire each button to its handler; listeners are registered while the
	    # Blocks context is still open.
	    text_button.click(embed, inputs=text_input, outputs=text_output)
	    image_button.click(search, inputs=image_input, outputs=image_output)
	    embed_button.click(sentence_embedding, inputs=embed_input, outputs=embed_output)
	    sim_button.click(sentence_sim, inputs=[sim_input1, sim_input2], outputs=sim_output)

	# Start the web server for the assembled interface.
	demo.launch()