# a-guy-from-burma's picture
# Update app.py
# 84949b2 verified
import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser
import requests
# Download the NLTK data packages this script requests: the perceptron
# tagger first, then the punkt data.
for _nltk_resource in ('averaged_perceptron_tagger', 'punkt'):
    nltk.download(_nltk_resource)

# Load tokenizer and model from the same pretrained checkpoint.
_CHECKPOINT = 'sentence-transformers/bert-base-nli-mean-tokens'
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)
model = AutoModel.from_pretrained(_CHECKPOINT)
def filter_pos(text):
    """Return the words of *text* whose POS tag starts with N, V, J or R.

    The text is split with ``word_tokenize`` and tagged with ``pos_tag``.
    Words that pass the filter keep their original order and are joined
    with single spaces. (Presumably these prefixes select nouns, verbs,
    adjectives and adverbs in the tagger's tagset.)
    """
    kept = []
    for token, pos in pos_tag(word_tokenize(text)):
        # str.startswith accepts a tuple and matches any of the prefixes.
        if pos.startswith(('N', 'V', 'J', 'R')):
            kept.append(token)
    return ' '.join(kept)
def get_embedding(text):
    """Return a mean-pooled model embedding for the POS-filtered *text*.

    The text is first reduced by ``filter_pos``, then tokenized (truncated
    to at most 128 tokens) and passed through the module-level ``model``
    with gradient tracking disabled. The result is ``last_hidden_state``
    averaged over dimension 1 (presumably the token axis).
    """
    encoded = tokenizer(
        filter_pos(text),
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    return hidden_states.mean(1)
def calculate_similarity(text1, text2):
    """Return the cosine similarity of two texts as a percentage string.

    Both texts are embedded with ``get_embedding``; the cosine similarity
    is taken along dimension 1 and formatted with two decimals, e.g.
    ``"87.65% Similarity"``.
    """
    first, second = get_embedding(text1), get_embedding(text2)
    score = torch.nn.functional.cosine_similarity(first, second, dim=1)
    # .item() requires a single-element tensor, i.e. one score per call.
    return f"{score.item():.2%} Similarity"
def report_issue(text1, text2, similarity):
    """Submit a text pair and its similarity result to the feedback form.

    Args:
        text1: First input text.
        text2: Second input text.
        similarity: Similarity value to report alongside the texts.

    Returns:
        "Report sent successfully!" when the form endpoint answers with
        HTTP 200; "Failed to send report." for any other status code or
        when the request itself fails (connection error, timeout, ...).
    """
    url = 'https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/formResponse'
    data = {
        'entry.1041881480': text1,
        'entry.1520964719': text2,
        'entry.2094809206': similarity
    }
    try:
        # requests has no default timeout; bound the wait so a stalled
        # connection cannot hang the UI callback indefinitely.
        response = requests.post(url, data=data, timeout=10)
    except requests.RequestException:
        # Network-level failures are reported the same way as a bad status
        # instead of surfacing as an unhandled exception in the UI.
        return "Failed to send report."
    if response.status_code == 200:
        return "Report sent successfully!"
    return "Failed to send report."
def extract_chunks(text):
    """Return the NP, PP and VP phrases a regexp chunker finds in *text*.

    The text is tokenized and POS-tagged, then parsed with a three-rule
    ``RegexpParser`` grammar. Each subtree labelled NP, PP or VP yields one
    phrase (its leaf words joined by spaces), in ``subtrees()`` order.
    """
    # Chunk grammar: noun phrases, prepositional phrases, verb phrases.
    grammar = r"""
NP: {<DT>?<JJ>*<NN>+} # Chunk sequences of DT, JJ, NN
PP: {<IN><NP>} # Chunk prepositions followed by NP
VP: {<VB.*><NP|PP>*} # Chunk verbs and their arguments
"""
    # Tag the tokens and run the chunker over them.
    tagged = pos_tag(word_tokenize(text))
    parsed = RegexpParser(grammar).parse(tagged)

    # Collect the surface text of every chunk with a label of interest.
    wanted_labels = ['NP', 'PP', 'VP']
    phrases = []
    for node in parsed.subtrees():
        if node.label() not in wanted_labels:
            continue
        phrases.append(" ".join(token for token, _ in node.leaves()))
    return phrases
# Build the Gradio UI: two text inputs, a button that computes the similarity
# and extracted chunks, and a second button that reports the current result.
# NOTE(review): which widgets sit inside each gr.Row() is a best-effort
# reading of the layout; confirm against the intended design.
with gr.Blocks() as app:
    with gr.Row():
        text1 = gr.Textbox(label="Input Text 1")
        text2 = gr.Textbox(label="Input Text 2")
    with gr.Row():
        button = gr.Button("Calculate Similarity")
    output = gr.Text(label="Similarity")
    chunks_output = gr.Text(label="Extracted Chunks")

    def combined_function(text1, text2):
        """Return the similarity string and a chunk summary for both texts."""
        similarity = calculate_similarity(text1, text2)
        chunks1 = extract_chunks(text1)
        chunks2 = extract_chunks(text2)
        # Both chunk lists are rendered with their Python list repr.
        chunks_text = f"Chunks in Text 1: {chunks1}\nChunks in Text 2: {chunks2}"
        return similarity, chunks_text

    # Fill both output fields from the two input texts.
    button.click(
        fn=combined_function,
        inputs=[text1, text2],
        outputs=[output, chunks_output]
    )
    report_button = gr.Button("Send result for better training")
    # Send the inputs and the displayed similarity text to report_issue; its
    # status message goes to a text field created inline here.
    report_button.click(
        fn=report_issue,
        inputs=[text1, text2, output],
        outputs=gr.Text(label="Report Status")
    )

# Start the app server.
app.launch()