Spaces:

avishek-018
/

bert-semantic-similarity

Sleeping

App Files Files Community

avishek-018 committed on Jul 14, 2023

Commit

ec9a5c9

•

1 Parent(s): 849bcc6

Create app.py

Browse files

Files changed (1) hide show

app.py +99 -0

app.py ADDED Viewed

	@@ -0,0 +1,99 @@

+from huggingface_hub import from_pretrained_keras
+import numpy as np
+import gradio as gr
+import transformers
+import tensorflow as tf
+class BertSemanticDataGenerator(tf.keras.utils.Sequence):
+    """Generates batches of data."""
+    def __init__(
+        self,
+        sentence_pairs,
+        labels,
+        batch_size=32,
+        shuffle=True,
+        include_targets=True,
+    ):
+        self.sentence_pairs = sentence_pairs
+        self.labels = labels
+        self.shuffle = shuffle
+        self.batch_size = batch_size
+        self.include_targets = include_targets
+        # Load our BERT Tokenizer to encode the text.
+        # We will use base-base-uncased pretrained model.
+        self.tokenizer = transformers.BertTokenizer.from_pretrained(
+            "bert-base-uncased", do_lower_case=True
+        )
+        self.indexes = np.arange(len(self.sentence_pairs))
+        self.on_epoch_end()
+    def __len__(self):
+        # Denotes the number of batches per epoch.
+        return len(self.sentence_pairs) // self.batch_size
+    def __getitem__(self, idx):
+        # Retrieves the batch of index.
+        indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size]
+        sentence_pairs = self.sentence_pairs[indexes]
+        # With BERT tokenizer's batch_encode_plus batch of both the sentences are
+        # encoded together and separated by [SEP] token.
+        encoded = self.tokenizer.batch_encode_plus(
+            sentence_pairs.tolist(),
+            add_special_tokens=True,
+            max_length=128,
+            return_attention_mask=True,
+            return_token_type_ids=True,
+            pad_to_max_length=True,
+            return_tensors="tf",
+        )
+        # Convert batch of encoded features to numpy array.
+        input_ids = np.array(encoded["input_ids"], dtype="int32")
+        attention_masks = np.array(encoded["attention_mask"], dtype="int32")
+        token_type_ids = np.array(encoded["token_type_ids"], dtype="int32")
+        # Set to true if data generator is used for training/validation.
+        if self.include_targets:
+            labels = np.array(self.labels[indexes], dtype="int32")
+            return [input_ids, attention_masks, token_type_ids], labels
+        else:
+            return [input_ids, attention_masks, token_type_ids]
+# Load the fine-tuned Keras model from the Hugging Face Hub once, at import time.
+model = from_pretrained_keras("avishek-018/bert-semantic-similarity")
+# Class names in output order: predict() pairs labels[i] with the model's i-th score.
+labels = ["contradiction", "entailment", "neutral"]
+def predict(sentence1, sentence2):
+    sentence_pairs = np.array([[str(sentence1), str(sentence2)]])
+    test_data = BertSemanticDataGenerator(
+        sentence_pairs, labels=None, batch_size=1, shuffle=False, include_targets=False,
+    )
+    probs = model.predict(test_data[0])[0]
+    labels_probs = {labels[i]: float(probs[i]) for i, _ in enumerate(labels)}
+    return labels_probs
+    #idx = np.argmax(proba)
+    #proba = f"{proba[idx]*100:.2f}%"
+    #pred = labels[idx]
+    #return f'The semantic similarity of two input sentences is {pred} with {proba} of probability'
+inputs = [
+         gr.Audio(source = "upload", label='Upload audio file', type="filepath"),
+]
+examples = [["Two women are observing something together.", "Two women are standing with their eyes closed."],
+            ["A smiling costumed woman is holding an umbrella", "A happy woman in a fairy costume holds an umbrella"],
+            ["A soccer game with multiple males playing", "Some men are playing a sport"],
+]
+gr.Interface(
+    fn=predict,
+    title="Semantic Similarity with BERT",
+    description = "Natural Language Inference by fine-tuning BERT model on SNLI Corpus 📰",
+    inputs=["text", "text"],
+    examples=examples,
+    #outputs=gr.Textbox(label='Prediction'),
+    outputs=gr.outputs.Label(num_top_classes=3, label='Semantic similarity'),
+    cache_examples=True,
+    ).launch(debug=True, enable_queue=True)