# chat-with-pdf / app.py
# Hugging Face Space by heisenberg3376 — "Update app.py" (commit f1e2997, verified)
import gradio as gr
import pdfplumber
import torch

import fitz  # PyMuPDF — used for PDF text extraction
import transformers
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BartForQuestionAnswering,
    BartTokenizer,
    pipeline,
)

# Extractive question-answering model (BART fine-tuned on SQuAD v2).
# Loaded once at import time so every request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained("aware-ai/bart-squadv2")
model = AutoModelForQuestionAnswering.from_pretrained("aware-ai/bart-squadv2")
# NOTE: commented-out scratch/example QA code removed; answer_question() below
# implements the same extractive span-selection logic.
# Define the function to process the input
def answer_question(question, pdf_text):
    """Answer *question* extractively from *pdf_text* with the QA model.

    Tokenizes the (question, context) pair, selects the highest-scoring
    start/end token positions, and decodes that span back to text.

    Note: input longer than the model's maximum sequence length is
    truncated, so answers buried deep in very large PDFs may be missed.
    """
    encoding = tokenizer(question, pdf_text, return_tensors='pt', truncation=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    # Inference only — no_grad avoids building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)
    # Guard against a degenerate prediction where end precedes start.
    if end_index < start_index:
        return ""
    # Decode the winning span straight from the input ids. This replaces the
    # original lossy round-trip (join tokens with spaces -> split -> ids ->
    # decode), which corrupted subword tokens containing internal markers.
    answer = tokenizer.decode(
        input_ids[0][start_index:end_index + 1], skip_special_tokens=True
    )
    return answer.strip()
import pdfplumber
import fitz # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of all pages of the PDF at *pdf_path*.

    Uses PyMuPDF (fitz). The document handle is closed even if extraction
    raises — the original version leaked the open document.
    """
    with fitz.open(pdf_path) as pdf:
        # Iterating the document yields its pages in order.
        return "".join(page.get_text() for page in pdf)
def chat_with_pdf(question, pdf):
    """Gradio handler: pull the text out of *pdf*, then answer *question* from it."""
    document_text = extract_text_from_pdf(pdf)
    answer = answer_question(question, document_text)
    return answer
# --- Gradio UI wiring ---
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")
pdf_upload = gr.File(type="filepath", file_types=[".pdf"], label="Upload PDF")
answer_box = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=chat_with_pdf,
    inputs=[question_box, pdf_upload],
    outputs=answer_box,
    title="Chat with PDF",
    description="Upload a PDF and ask questions about its content. Make sure the PDF is NOT too LARGE",
)

# Launch only when executed as a script (not when imported).
if __name__ == "__main__":
    interface.launch()