|
from huggingface_hub import from_pretrained_keras |
|
import numpy as np |
|
import gradio as gr |
|
import transformers |
|
import tensorflow as tf |
|
|
|
class BertSemanticDataGenerator(tf.keras.utils.Sequence):
    """Generates batches of BERT-ready tensors from sentence pairs.

    Wraps a numpy array of (sentence1, sentence2) pairs and, per
    ``__getitem__`` call, tokenizes one batch with the
    ``bert-base-uncased`` tokenizer into the three inputs BERT expects.

    Args:
        sentence_pairs: numpy array of shape (num_pairs, 2) of text pairs.
        labels: integer class labels aligned with ``sentence_pairs``; may
            be ``None`` when ``include_targets`` is ``False``.
        batch_size: number of pairs per batch.
        shuffle: if ``True``, reshuffle sample order after every epoch.
        include_targets: if ``True``, ``__getitem__`` also returns labels.
    """

    def __init__(
        self,
        sentence_pairs,
        labels,
        batch_size=32,
        shuffle=True,
        include_targets=True,
    ):
        self.sentence_pairs = sentence_pairs
        self.labels = labels
        self.shuffle = shuffle
        self.batch_size = batch_size
        self.include_targets = include_targets
        # Downloads the vocabulary on first use (network access required).
        self.tokenizer = transformers.BertTokenizer.from_pretrained(
            "bert-base-uncased", do_lower_case=True
        )
        self.indexes = np.arange(len(self.sentence_pairs))
        self.on_epoch_end()

    def __len__(self):
        # Number of complete batches; a trailing partial batch is dropped.
        return len(self.sentence_pairs) // self.batch_size

    def __getitem__(self, idx):
        """Return batch ``idx`` as ``[input_ids, attention_masks,
        token_type_ids]`` (plus ``labels`` when ``include_targets``)."""
        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
        sentence_pairs = self.sentence_pairs[indexes]

        # Tokenize both sentences jointly so BERT receives
        # [CLS] s1 [SEP] s2 [SEP], with token_type_ids marking segments.
        # FIX: ``padding="max_length"`` + ``truncation=True`` replace the
        # deprecated ``pad_to_max_length=True`` flag (same behavior).
        encoded = self.tokenizer.batch_encode_plus(
            sentence_pairs.tolist(),
            add_special_tokens=True,
            max_length=128,
            return_attention_mask=True,
            return_token_type_ids=True,
            padding="max_length",
            truncation=True,
            return_tensors="tf",
        )

        input_ids = np.array(encoded["input_ids"], dtype="int32")
        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")

        if self.include_targets:
            labels = np.array(self.labels[indexes], dtype="int32")
            return [input_ids, attention_masks, token_type_ids], labels
        else:
            return [input_ids, attention_masks, token_type_ids]

    def on_epoch_end(self):
        # BUG FIX: ``shuffle`` was stored but never applied — the base
        # ``Sequence.on_epoch_end`` is a no-op, so batches were always in
        # the original order. Reshuffle the index permutation between
        # epochs when requested (no-op for shuffle=False, e.g. inference).
        if self.shuffle:
            np.random.RandomState(42).shuffle(self.indexes)
|
|
|
# Load the fine-tuned Keras model from the Hugging Face Hub.
# NOTE: this is a module-level network download and runs at import time.
model = from_pretrained_keras("avishek-018/bert-semantic-similarity")

# Class names in the order the model's output probabilities are indexed.
labels = ["contradiction", "entailment", "neutral"]
|
|
|
def predict(sentence1, sentence2):
    """Classify the semantic relation between two sentences.

    Returns a dict mapping each class name ("contradiction",
    "entailment", "neutral") to the model's probability for it.
    """
    pair = np.array([[str(sentence1), str(sentence2)]])
    generator = BertSemanticDataGenerator(
        pair,
        labels=None,
        batch_size=1,
        shuffle=False,
        include_targets=False,
    )
    # Run the model on the single tokenized batch; take the first row.
    probabilities = model.predict(generator[0])[0]
    return {name: float(score) for name, score in zip(labels, probabilities)}
|
|
|
|
|
|
|
|
|
|
|
|
|
# NOTE(review): this list is never referenced — gr.Interface below passes
# inputs=["text", "text"] directly. The original audio-upload component
# (with the deprecated ``source=`` kwarg) looks like a copy-paste leftover
# from another demo; replaced with the two text inputs this app actually
# uses so the definition at least matches the interface.
inputs = [
    gr.Textbox(label="Sentence 1"),
    gr.Textbox(label="Sentence 2"),
]
|
|
|
# Sample sentence pairs shown beneath the interface; each entry is
# [premise, hypothesis].
examples = [
    [
        "Two women are observing something together.",
        "Two women are standing with their eyes closed.",
    ],
    [
        "A smiling costumed woman is holding an umbrella",
        "A happy woman in a fairy costume holds an umbrella",
    ],
    [
        "A soccer game with multiple males playing",
        "Some men are playing a sport",
    ],
]
|
|
|
# Build and launch the demo UI. ``cache_examples=True`` runs ``predict``
# on every example at startup, so launch performs several model inferences.
gr.Interface(
    fn=predict,
    title="Semantic Similarity with BERT",
    description="Natural Language Inference by fine-tuning BERT model on SNLI Corpus 📰 - by Avishek Das",
    inputs=["text", "text"],
    examples=examples,
    # FIX: ``gr.outputs.Label`` is the deprecated output-class API;
    # ``gr.Label`` is the supported component with the same parameters.
    outputs=gr.Label(num_top_classes=3, label="Semantic similarity"),
    cache_examples=True,
    article='Author: <a href="https://huggingface.co/avishek-018">Avishek Das</a>.',
).launch(debug=True, enable_queue=True)