# PDF_TEXT_QA / app.py
# raghuram13's picture
# Update app.py
# 0c86a34
import gradio as gr
import PyPDF2
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
# Fetch the NLTK data sets this module relies on: the 'punkt' sentence/word
# tokenizer models and the 'stopwords' word lists.
for _resource in ('punkt', 'stopwords'):
    nltk.download(_resource)
def extract_text(file):
    """
    Extract the text of every page of a PDF document.

    Args:
        file: Either a filesystem path to the PDF, or an object that exposes
            that path through a ``name`` attribute (the form the original
            implementation required).

    Returns:
        The text of all pages concatenated in page order, with no separator
        inserted between pages.

    Raises:
        OSError: If the file cannot be opened (e.g. ``FileNotFoundError``).
    """
    # Accept both a plain path and an upload-style object carrying ``.name``.
    path = getattr(file, "name", file)
    # The context manager guarantees the handle is closed even if parsing
    # fails; pages must be read while the handle is still open.
    with open(path, 'rb') as pdf_file:
        reader = PyPDF2.PdfReader(pdf_file)
        # ``or ""`` guards against a page yielding no text at all, and
        # ``join`` avoids repeated string concatenation.
        return "".join(page.extract_text() or "" for page in reader.pages)
def generate_answers(text, question):
    """
    Pick the sentence of ``text`` that best matches ``question``.

    The text is split into sentences, every sentence is turned into a TF-IDF
    vector (English stop words removed), the question is projected into the
    same vector space, and the sentence with the highest cosine similarity to
    the question is returned.

    Args:
        text: The document text to search.
        question: The question to answer.

    Returns:
        The best-matching sentence, or a fixed apology message when the text
        or question is empty, when the text contains no usable vocabulary, or
        when no sentence shares any term with the question.
    """
    no_answer = "I'm sorry, I couldn't find an answer to that question."

    # Nothing to search, or nothing to search for.
    if not text or not text.strip() or not question or not question.strip():
        return no_answer

    sentences = nltk.sent_tokenize(text)
    if not sentences:
        return no_answer

    # scikit-learn expects the stop words as a list, not a set.
    vectorizer = TfidfVectorizer(stop_words=stopwords.words('english'))
    try:
        sentence_vectors = vectorizer.fit_transform(sentences)
    except ValueError:
        # Raised when the vocabulary is empty, e.g. the text consists only of
        # stop words or punctuation.
        return no_answer

    # Project the question into the vector space fitted on the sentences;
    # words the vectorizer has never seen simply contribute nothing.
    question_vector = vectorizer.transform([question])

    # One row (the question) against every sentence.
    similarities = cosine_similarity(question_vector, sentence_vectors)[0]
    best_idx = int(similarities.argmax())

    # A zero score means no sentence shares a term with the question.
    if similarities[best_idx] <= 0:
        return no_answer
    return sentences[best_idx]
# Create the Gradio app interface
def app():
    """
    Build and launch the Gradio interface for PDF question answering.

    The interface takes an uploaded PDF and a question, and displays the text
    extracted from the PDF together with the sentence chosen as the answer.
    """
    # Component classes are taken from the top level of the package; the
    # older ``gr.inputs`` / ``gr.outputs`` namespaces are deprecated.
    file_input = gr.File(label="Upload PDF Document")
    question_input = gr.Textbox(label="Enter a question")
    output_text = gr.Textbox(label="Extracted Text")
    output_answer = gr.Textbox(label="Answer")

    def predict(file, question):
        """Return ``(extracted_text, answer)`` for the uploaded PDF."""
        # The form can be submitted without a file; report that instead of
        # failing inside the extraction step.
        if file is None:
            return "", "Please upload a PDF document."
        # Extract text from the file
        text = extract_text(file)
        # Generate an answer to the question
        answer = generate_answers(text, question)
        return text, answer

    # Create the interface and run the app
    iface = gr.Interface(
        fn=predict,
        inputs=[file_input, question_input],
        outputs=[output_text, output_answer],
        title="PDF QA Generator",
    )
    iface.launch()
# Launch the interface only when this file is executed as a script.
if __name__ == "__main__":
    app()