Spaces:

raghuram13
/

PDF_TEXT_QA

Runtime error

App Files Files Community

PDF_TEXT_QA / app.py

raghuram13

Update app.py

0c86a34 over 2 years ago

raw

history blame contribute delete

2.48 kB

	import gradio as gr
	import PyPDF2
	import nltk
	from nltk.corpus import stopwords
	from sklearn.feature_extraction.text import TfidfVectorizer
	from sklearn.metrics.pairwise import cosine_similarity

	# Fetch the NLTK resources used below at start-up:
	#   * 'punkt' / 'punkt_tab' - sentence tokenizer models used by
	#     nltk.sent_tokenize ('punkt_tab' is the name newer NLTK releases look
	#     up; 'punkt' is kept for older releases).
	#   * 'stopwords'           - the English stop-word list.
	# nltk.download reports failure through its return value, so requesting a
	# resource that an installed NLTK version does not need is harmless.
	for _resource in ('punkt', 'punkt_tab', 'stopwords'):
	    nltk.download(_resource)

	def extract_text(file):
	    """
	    Extract the text of every page of a PDF document.

	    Args:
	        file: Either an uploaded-file object exposing the on-disk path as
	            ``file.name`` or a plain filesystem path (``str`` /
	            ``os.PathLike``).

	    Returns:
	        The text of all pages concatenated in page order, as one string.
	    """
	    import os

	    # Path-like inputs are used directly; anything else is expected to carry
	    # its path in ``.name`` (checked in this order because ``Path.name`` is
	    # only the final path component, not a usable path).
	    if isinstance(file, (str, bytes, os.PathLike)):
	        path = file
	    else:
	        path = file.name

	    # The context manager guarantees the handle is closed even if parsing
	    # raises part-way through the document.
	    with open(path, 'rb') as pdf_file:
	        read_pdf = PyPDF2.PdfReader(pdf_file)
	        # ``or ""`` keeps the join safe if a page yields no text (None/empty).
	        return "".join(page.extract_text() or "" for page in read_pdf.pages)


	def generate_answers(text, question):
	    """
	    Pick the sentence of ``text`` that best matches ``question``.

	    The text is split into sentences, each sentence and the question are
	    turned into TF-IDF vectors over the same vocabulary (English stop words
	    removed), and the sentence with the highest cosine similarity to the
	    question is returned.

	    Args:
	        text: The document text to search.
	        question: The user's question.

	    Returns:
	        The best-matching sentence, or an apology message when the text has
	        no usable sentences or no sentence shares any term with the question.
	    """
	    no_answer = "I'm sorry, I couldn't find an answer to that question."

	    sentences = nltk.sent_tokenize(text)
	    if not sentences or not question.strip():
	        return no_answer

	    # scikit-learn validates ``stop_words`` as 'english', a list or None,
	    # so pass a list rather than a set.
	    vectorizer = TfidfVectorizer(stop_words=list(stopwords.words('english')))
	    try:
	        sentence_vectors = vectorizer.fit_transform(sentences)
	    except ValueError:
	        # fit_transform raises ValueError when no terms survive
	        # tokenisation/stop-word removal (empty vocabulary).
	        return no_answer

	    # Project the question into the vocabulary learned from the sentences and
	    # compare it with every sentence: the result has shape (1, n_sentences).
	    question_vector = vectorizer.transform([question])
	    scores = cosine_similarity(question_vector, sentence_vectors)[0]

	    # argmax returns the first maximum, so ties resolve to the earliest
	    # sentence.
	    best_idx = int(scores.argmax())
	    if scores[best_idx] <= 0:
	        # No sentence shares a single weighted term with the question.
	        return no_answer

	    return sentences[best_idx]


	# Create the Gradio app interface
	def app():
	    """
	    Build the "PDF QA Generator" Gradio interface and launch it.

	    The interface takes a PDF upload and a question, and shows the text
	    extracted from the PDF together with the answer chosen for the question.
	    """
	    # Top-level component classes are used instead of the ``gr.inputs`` /
	    # ``gr.outputs`` namespaces, which newer Gradio releases no longer ship.
	    file_input = gr.File(label="Upload PDF Document")
	    question_input = gr.Textbox(label="Enter a question")
	    output_text = gr.Textbox(label="Extracted Text")
	    output_answer = gr.Textbox(label="Answer")

	    def predict(file, question):
	        """Return ``(extracted_text, answer)`` for the uploaded PDF."""
	        # Submitting without an upload passes None; answer gracefully
	        # instead of failing inside extract_text.
	        if file is None:
	            return "", "Please upload a PDF document."

	        # Extract text from the file
	        text = extract_text(file)

	        # Generate an answer to the question
	        answer = generate_answers(text, question)

	        return text, answer

	    # Create the interface and run the app
	    iface = gr.Interface(
	        fn=predict,
	        inputs=[file_input, question_input],
	        outputs=[output_text, output_answer],
	        title="PDF QA Generator",
	    )
	    iface.launch()


	# Launch the interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    app()