import gradio as gr
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import pipeline

# Sentence-embedding model used to rank the documents against a question.
# NOTE(review): this checkpoint name is an older sentence-transformers model;
# confirm it is still the intended one before swapping it for a newer model.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

# Extractive QA model: given a question and a context passage, it returns the
# span of the passage that best answers the question.
qa_pipeline = pipeline(
    "question-answering",
    model="distilbert-base-cased-distilled-squad",
)

# Built-in corpus of security-related passages searched by answer_question.
documents = [
    "Data breaches are a common occurrence in today's digital world. Companies must take measures to protect sensitive information from unauthorized access or disclosure.",
    "Phishing attacks are a type of cyberattack that use social engineering to trick users into divulging confidential information. Employees should be trained to recognize and avoid phishing scams.",
    "The use of encryption can help prevent unauthorized access to data by encrypting it so that it can only be accessed by authorized users who have the decryption key.",
    "A firewall is a network security system that monitors and controls incoming and outgoing network traffic based on predetermined security rules. It can help protect against unauthorized access to a network.",
    "Access control is the process of limiting access to resources to only authorized users. It is an important aspect of information security and can be achieved through the use of authentication and authorization mechanisms.",
]

# Example security question. The visible code does not read this name:
# answer_question takes its own `question` argument, which shadows it.
question = "What measures can companies take to protect sensitive information from unauthorized access or disclosure?"
def answer_question(document: str, question: str, *, top_k: int = 3) -> list:
    """Answer ``question`` from the documents most similar to it.

    The candidate pool is the module-level ``documents`` list, plus
    ``document`` when it is non-blank and not already in that list.
    Candidates are ranked by cosine similarity between their sentence
    embeddings and the question embedding, and the extractive QA pipeline is
    run on each of the ``top_k`` best candidates.

    Args:
        document: Optional extra passage supplied by the user. Blank or
            ``None`` means "search the built-in documents only".
        question: The question to answer.
        top_k: Maximum number of candidates to extract an answer from.

    Returns:
        A list of answer strings, one per selected candidate, ordered from
        the most similar candidate to the least similar. Empty when the
        question is blank or ``top_k`` is not positive.
    """
    # The QA pipeline cannot do anything useful with a blank question.
    if not question or not question.strip():
        return []
    if top_k <= 0:
        return []

    # Copy so the shared corpus is never mutated by a request.
    candidates = list(documents)
    if document and document.strip() and document not in candidates:
        candidates.append(document)

    doc_embeddings = model.encode(candidates)
    query_embedding = model.encode([question])

    # cosine_similarity returns a (1, n_candidates) matrix; take its only row.
    similarity_scores = cosine_similarity(query_embedding, doc_embeddings)[0]

    # argsort is ascending, so reverse it to put the best match first.
    ranked_idxs = similarity_scores.argsort()[::-1][:top_k]

    return [
        qa_pipeline(question=question, context=candidates[idx])["answer"]
        for idx in ranked_idxs
    ]


def _answer_as_text(document, question):
    """Gradio callback: show the ranked answers one per line.

    The output component is a Textbox, so the list returned by
    answer_question is joined into a single string here.
    """
    return "\n".join(answer_question(document, question))


# Input and output components for the Gradio app.
# NOTE(review): gr.inputs / gr.outputs are the legacy component namespaces;
# confirm the pinned Gradio version before migrating to gr.Textbox.
inputs = [
    gr.inputs.Textbox(label="Document"),
    gr.inputs.Textbox(label="Question"),
]
outputs = gr.outputs.Textbox(label="Answer")

# Build the Gradio app. Each example row is [document, question].
app = gr.Interface(
    fn=_answer_as_text,
    inputs=inputs,
    outputs=outputs,
    title="Security Question Answering",
    description="Enter a security question and a document, and the app will return the answer based on the top 3 most similar documents.",
    examples=[
        [
            "Data breaches are a common occurrence in today's digital world. Companies must take measures to protect sensitive information from unauthorized access or disclosure.",
            "What measures can companies take to protect sensitive information?",
        ],
        [
            "Phishing attacks are a type of cyberattack that use social engineering to trick users into divulging confidential information. Employees should be trained to recognize and avoid phishing scams.",
            "What is a phishing attack?",
        ],
        [
            "The use of encryption can help prevent unauthorized access to data by encrypting it so that it can only be accessed by authorized users who have the decryption key.",
            "What is encryption?",
        ],
        [
            "A firewall is a network security system that monitors and controls incoming and outgoing network traffic based on predetermined security rules. It can help protect against unauthorized access to a network.",
            "What is a firewall?",
        ],
    ],
)

# Run the app.
app.launch()