# Scraped page header (not part of the program): "Spaces: Runtime error"
import gradio as gr | |
import PyPDF2 | |
import nltk | |
from nltk.corpus import stopwords | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
# Fetch the NLTK resources used below: the Punkt sentence tokenizer models
# and the English stop-word list.
nltk.download('punkt')
# Recent NLTK releases load the tokenizer from 'punkt_tab' instead of
# 'punkt'; requesting both keeps sent_tokenize working on old and new versions.
nltk.download('punkt_tab')
nltk.download('stopwords')
def extract_text(file):
    """
    Return the text of every page of a PDF, concatenated in page order.

    Args:
        file: A filesystem path, or an object whose ``name`` attribute holds
            the path of the PDF (e.g. an uploaded-file wrapper). ``None``
            (nothing uploaded) yields an empty string.

    Returns:
        The extracted text as a single string; pages that yield no text
        contribute nothing.
    """
    if file is None:
        return ""

    # Accept both plain paths and file-like wrappers exposing ``.name``.
    pdf_path = getattr(file, "name", file)

    # The context manager guarantees the handle is closed even if parsing fails.
    with open(pdf_path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` guards against a page for which no text is returned.
        return "".join(page.extract_text() or "" for page in reader.pages)
def generate_answers(text, question):
    """
    Return the sentence of ``text`` that best matches ``question``.

    The text is split into sentences, the sentences are embedded with TF-IDF
    (English stop words removed), the question is projected into the same
    vector space, and the sentence with the highest cosine similarity to the
    question is returned.

    Args:
        text: The document text to search.
        question: The question to answer.

    Returns:
        The best-matching sentence, or an apology message when the text or
        question is empty, or when no sentence shares any term with the
        question.
    """
    fallback = "I'm sorry, I couldn't find an answer to that question."

    # Nothing to search, or nothing to search for.
    if not text or not text.strip() or not question or not question.strip():
        return fallback

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return fallback

    # scikit-learn validates ``stop_words`` and expects a list, not a set.
    stop_words = list(stopwords.words('english'))
    vectorizer = TfidfVectorizer(stop_words=stop_words)
    try:
        sentence_vectors = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the vocabulary is empty (e.g. only stop words remain).
        return fallback

    # Score the question against every sentence. The vectorizer lowercases
    # and tokenizes the question the same way it did the sentences.
    question_vector = vectorizer.transform([question])
    similarities = cosine_similarity(question_vector, sentence_vectors)[0]

    best_idx = int(similarities.argmax())
    if similarities[best_idx] <= 0:
        # No sentence shares a term with the question.
        return fallback
    return sentences[best_idx]
# Create the Gradio app interface | |
def app():
    """
    Build the "PDF QA Generator" Gradio interface and launch it.

    The interface takes an uploaded PDF and a question, and shows the text
    extracted from the PDF together with the answer produced for the question.
    """
    # The ``gr.inputs`` / ``gr.outputs`` namespaces were deprecated in
    # Gradio 3 and removed in Gradio 4; the top-level component classes
    # serve as both inputs and outputs.
    file_input = gr.File(label="Upload PDF Document")
    question_input = gr.Textbox(label="Enter a question")
    output_text = gr.Textbox(label="Extracted Text")
    output_answer = gr.Textbox(label="Answer")

    def predict(file, question):
        """Extract the PDF text and answer ``question`` from it."""
        text = extract_text(file)
        answer = generate_answers(text, question)
        return text, answer

    # Create the interface and run the app
    iface = gr.Interface(
        fn=predict,
        inputs=[file_input, question_input],
        outputs=[output_text, output_answer],
        title="PDF QA Generator",
    )
    iface.launch()
# Launch the app only when this file is run as a script, not when imported.
if __name__ == '__main__':
    app()