import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser
import requests

# Tokenizer and tagger resources used by word_tokenize() and pos_tag().
# Note: on NLTK >= 3.9 these were renamed to 'punkt_tab' and
# 'averaged_perceptron_tagger_eng'; download those instead if a LookupError occurs.
nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Sentence-embedding model: BERT fine-tuned on NLI, intended for mean-token pooling.
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')

def filter_pos(text):
    """Keep only content words: nouns (N*), verbs (V*), adjectives (J*), adverbs (R*)."""
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return ' '.join(word for word, tag in tagged_words
                    if tag.startswith(('N', 'V', 'J', 'R')))
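
# Illustrative call (the exact output depends on the tagger's decisions):
# filter_pos("The film was surprisingly good") -> "film was surprisingly good"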

def get_embedding(text):
    """Embed a text by mean-pooling BERT's final hidden states over its tokens."""
    filtered_text = filter_pos(text)
    tokens = tokenizer(filtered_text, return_tensors='pt', padding=True,
                       truncation=True, max_length=128)
    with torch.no_grad():  # inference only; no gradients needed
        output = model(**tokens)
    # Mean over the sequence dimension -> one vector of shape (1, hidden_size).
    return output.last_hidden_state.mean(dim=1)
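
# The plain mean above is safe here because each call tokenizes a single text,
# so no padding tokens are produced. A sketch of a mask-aware mean for the case
# where several texts are batched in one tokenizer call (get_embedding_batch is
# a hypothetical helper, not part of the original app):
def get_embedding_batch(texts):
    tokens = tokenizer(texts, return_tensors='pt', padding=True,
                       truncation=True, max_length=128)
    with torch.no_grad():
        output = model(**tokens)
    mask = tokens['attention_mask'].unsqueeze(-1)           # (batch, seq, 1)
    summed = (output.last_hidden_state * mask).sum(dim=1)   # zero out padding
    counts = mask.sum(dim=1).clamp(min=1)                   # real tokens per text
    return summed / counts                                  # masked mean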

def calculate_similarity(text1, text2):
    """Return the cosine similarity of the two texts, formatted as a percentage."""
    embed1 = get_embedding(text1)
    embed2 = get_embedding(text2)
    cos = torch.nn.CosineSimilarity(dim=1)
    similarity = cos(embed1, embed2)
    # Cosine similarity lies in [-1, 1], so the formatted value can be negative.
    return f"{similarity.item():.2%} Similarity"

def report_issue(text1, text2, similarity):
    """POST the text pair and similarity score to a Google Form for later review."""
    url = 'https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/formResponse'
    # The entry.* keys are the form's field identifiers.
    data = {
        'entry.1041881480': text1,
        'entry.1520964719': text2,
        'entry.2094809206': similarity
    }
    response = requests.post(url, data=data, timeout=10)
    if response.status_code == 200:
        return "Report sent successfully!"
    else:
        return "Failed to send report."

def extract_chunks(text):
    """Extract noun, prepositional, and verb phrases with a regexp chunk grammar."""
    grammar = r"""
        NP: {<DT>?<JJ>*<NN>+}   # noun phrase: optional determiner, adjectives, nouns
        PP: {<IN><NP>}          # prepositional phrase: preposition followed by an NP
        VP: {<VB.*><NP|PP>*}    # verb phrase: verb plus its NP/PP arguments
    """
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    chunk_parser = RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_words)
    # Collect the surface text of every NP/PP/VP subtree.
    phrases = [" ".join(word for word, tag in subtree.leaves())
               for subtree in tree.subtrees()
               if subtree.label() in ('NP', 'PP', 'VP')]
    return phrases
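
# Illustrative call (the exact phrases depend on the tagger's output; nested
# chunks appear both inside their parent and on their own):
# extract_chunks("She reads a good book")
#   -> ['reads a good book', 'a good book']  # the VP, then the NP nested in it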

with gr.Blocks() as app:
    with gr.Row():
        text1 = gr.Textbox(label="Input Text 1")
        text2 = gr.Textbox(label="Input Text 2")
    with gr.Row():
        button = gr.Button("Calculate Similarity")
    output = gr.Text(label="Similarity")
    chunks_output = gr.Text(label="Extracted Chunks")

    # One click computes the similarity score and the chunk listings together.
    def combined_function(text1, text2):
        similarity = calculate_similarity(text1, text2)
        chunks1 = extract_chunks(text1)
        chunks2 = extract_chunks(text2)
        chunks_text = f"Chunks in Text 1: {chunks1}\nChunks in Text 2: {chunks2}"
        return similarity, chunks_text

    button.click(
        fn=combined_function,
        inputs=[text1, text2],
        outputs=[output, chunks_output]
    )

    report_button = gr.Button("Send result for better training")
    # Create the status box as a named component rather than inline in .click().
    report_status = gr.Text(label="Report Status")
    report_button.click(
        fn=report_issue,
        inputs=[text1, text2, output],
        outputs=report_status
    )

app.launch()
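# launch() serves the app locally; passing share=True (a standard Gradio
# option) would additionally create a temporary public link.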