import gradio as gr
from transformers import AutoTokenizer, AutoModel
import torch
import nltk
from nltk import pos_tag
from nltk.tokenize import word_tokenize
from nltk.chunk import RegexpParser
import requests

nltk.download('averaged_perceptron_tagger')
nltk.download('punkt')

# Load tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')
model = AutoModel.from_pretrained('sentence-transformers/bert-base-nli-mean-tokens')


def filter_pos(text):
    # Keep only nouns, verbs, adjectives, and adverbs before embedding
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    return ' '.join([word for word, tag in tagged_words
                     if tag.startswith('N') or tag.startswith('V')
                     or tag.startswith('J') or tag.startswith('R')])


def get_embedding(text):
    filtered_text = filter_pos(text)
    tokens = tokenizer(filtered_text, return_tensors='pt', padding=True,
                       truncation=True, max_length=128)
    with torch.no_grad():
        output = model(**tokens)
    # Mean-pool the token embeddings into a single sentence vector
    return output.last_hidden_state.mean(1)


def calculate_similarity(text1, text2):
    embed1 = get_embedding(text1)
    embed2 = get_embedding(text2)
    cos = torch.nn.CosineSimilarity(dim=1)
    similarity = cos(embed1, embed2)
    return f"{similarity.item():.2%} Similarity"


def report_issue(text1, text2, similarity):
    url = 'https://docs.google.com/forms/d/e/1FAIpQLSdABQaCNCmHXDyHLsL2lLsxgu386hv9ALU2UbCVL9bUoIwemQ/formResponse'
    data = {
        'entry.1041881480': text1,
        'entry.1520964719': text2,
        'entry.2094809206': similarity
    }
    response = requests.post(url, data=data)
    if response.status_code == 200:
        return "Report sent successfully!"
    else:
        return "Failed to send report."


def extract_chunks(text):
    # Define grammar for chunking
    grammar = r"""
        NP: {<DT>?<JJ>*<NN.*>+}   # Chunk sequences of DT, JJ, NN
        PP: {<IN><NP>}            # Chunk prepositions followed by NP
        VP: {<VB.*><NP|PP>*}      # Chunk verbs and their arguments
    """
    # Tokenize and POS-tag
    words = word_tokenize(text)
    tagged_words = pos_tag(words)
    chunk_parser = RegexpParser(grammar)
    tree = chunk_parser.parse(tagged_words)
    # Extract phrases
    phrases = [" ".join(word for word, tag in subtree.leaves())
               for subtree in tree.subtrees()
               if subtree.label() in ['NP', 'PP', 'VP']]
    return phrases


with gr.Blocks() as app:
    with gr.Row():
        text1 = gr.Textbox(label="Input Text 1")
        text2 = gr.Textbox(label="Input Text 2")
    with gr.Row():
        button = gr.Button("Calculate Similarity")
    output = gr.Text(label="Similarity")
    chunks_output = gr.Text(label="Extracted Chunks")

    def combined_function(text1, text2):
        similarity = calculate_similarity(text1, text2)
        chunks1 = extract_chunks(text1)
        chunks2 = extract_chunks(text2)
        chunks_text = f"Chunks in Text 1: {chunks1}\nChunks in Text 2: {chunks2}"
        return similarity, chunks_text

    button.click(
        fn=combined_function,
        inputs=[text1, text2],
        outputs=[output, chunks_output]
    )

    report_button = gr.Button("Send result for better training")
    report_button.click(
        fn=report_issue,
        inputs=[text1, text2, output],
        outputs=gr.Text(label="Report Status")
    )

app.launch()