|
from huggingface_hub import from_pretrained_keras |
|
import numpy as np |
|
import gradio as gr |
|
import transformers |
|
import tensorflow as tf |
|
|
|
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-ready tensors from sentence pairs.

    Wraps a numpy array of (sentence1, sentence2) pairs and, per
    ``__getitem__`` call, tokenizes one batch with the
    ``bert-base-uncased`` tokenizer into the three inputs BERT expects.

    Args:
        sentence_pairs: numpy array of shape (num_pairs, 2) of text pairs.
        labels: integer class labels aligned with ``sentence_pairs``; may
            be ``None`` when ``include_targets`` is ``False``.
        batch_size: number of pairs per batch.
        shuffle: if ``True``, reshuffle sample order after every epoch.
        include_targets: if ``True``, ``__getitem__`` also returns labels.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=32,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Downloads the vocabulary on first use (network access required).
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Number of complete batches; a trailing partial batch is dropped.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``[input_ids, attention_masks,
        token_type_ids]`` (plus ``labels`` when ``include_targets``)."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # Tokenize both sentences jointly so BERT receives
        # [CLS] s1 [SEP] s2 [SEP], with token_type_ids marking segments.
        # FIX: ``padding="max_length"`` + ``truncation=True`` replace the
        # deprecated ``pad_to_max_length=True`` flag (same behavior).
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=128,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # BUG FIX: ``shuffle`` was stored but never applied — the base
        # ``Sequence.on_epoch_end`` is a no-op, so batches were always in
        # the original order. Reshuffle the index permutation between
        # epochs when requested (no-op for shuffle=False, e.g. inference).
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)
|
|
|
# Load the fine-tuned Keras model from the Hugging Face Hub.
# NOTE: this is a module-level network download and runs at import time.
model = from_pretrained_keras("avishek-018/bert-semantic-similarity")

# Class names in the order the model's output probabilities are indexed.
labels = ["contradiction", "entailment", "neutral"]
|
|
|
def predict(sentence1, sentence2):
    """Classify the semantic relation between two sentences.

    Returns a dict mapping each class name ("contradiction",
    "entailment", "neutral") to the model's probability for it.
    """
    pair = np.array([[str(sentence1), str(sentence2)]])
    generator = BertSemanticDataGenerator(
        pair,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # Run the model on the single tokenized batch; take the first row.
    probabilities = model.predict(generator[0])[0]
    return {name: float(score) for name, score in zip(labels, probabilities)}
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): this list is never referenced — gr.Interface below passes
# inputs=["text", "text"] directly. The original audio-upload component
# (with the deprecated ``source=`` kwarg) looks like a copy-paste leftover
# from another demo; replaced with the two text inputs this app actually
# uses so the definition at least matches the interface.
inputs = [
    gr.Textbox(label="Sentence 1"),
    gr.Textbox(label="Sentence 2"),
]
|
|
|
# Sample sentence pairs shown beneath the interface; each entry is
# [premise, hypothesis].
examples = [
    [
        "Two women are observing something together.",
        "Two women are standing with their eyes closed.",
    ],
    [
        "A smiling costumed woman is holding an umbrella",
        "A happy woman in a fairy costume holds an umbrella",
    ],
    [
        "A soccer game with multiple males playing",
        "Some men are playing a sport",
    ],
]
|
|
|
# Build and launch the demo UI. ``cache_examples=True`` runs ``predict``
# on every example at startup, so launch performs several model inferences.
gr.Interface(
    fn=predict,
    title="Semantic Similarity with BERT",
    description="Natural Language Inference by fine-tuning BERT model on SNLI Corpus 📰 - by Avishek Das",
    inputs=["text", "text"],
    examples=examples,
    # FIX: ``gr.outputs.Label`` is the deprecated output-class API;
    # ``gr.Label`` is the supported component with the same parameters.
    outputs=gr.Label(num_top_classes=3, label="Semantic similarity"),
    cache_examples=True,
    article='Author: <a href="https://huggingface.co/avishek-018">Avishek Das</a>.',
).launch(debug=True, enable_queue=True)