# chat-with-pdf / app.py
# Hugging Face Space by heisenberg3376 — "Update app.py" (commit f1e2997, verified)
import gradio as gr
import pdfplumber
import torch

import fitz  # PyMuPDF — used for PDF text extraction
import transformers
from transformers import (
    AutoModelForQuestionAnswering,
    AutoTokenizer,
    BartForQuestionAnswering,
    BartTokenizer,
    pipeline,
)

# Extractive question-answering model (BART fine-tuned on SQuAD v2).
# Loaded once at import time so every request reuses the same weights.
tokenizer = AutoTokenizer.from_pretrained("aware-ai/bart-squadv2")
model = AutoModelForQuestionAnswering.from_pretrained("aware-ai/bart-squadv2")
# NOTE: commented-out scratch/example QA code removed; answer_question() below
# implements the same extractive span-selection logic.
# Define the function to process the input
def answer_question(question, pdf_text):
    """Answer *question* extractively from *pdf_text* with the QA model.

    Tokenizes the (question, context) pair, selects the highest-scoring
    start/end token positions, and decodes that span back to text.

    Note: input longer than the model's maximum sequence length is
    truncated, so answers buried deep in very large PDFs may be missed.
    """
    encoding = tokenizer(question, pdf_text, return_tensors='pt', truncation=True)
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']
    # Inference only — no_grad avoids building the autograd graph.
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)
    start_index = torch.argmax(outputs.start_logits)
    end_index = torch.argmax(outputs.end_logits)
    # Guard against a degenerate prediction where end precedes start.
    if end_index < start_index:
        return ""
    # Decode the winning span straight from the input ids. This replaces the
    # original lossy round-trip (join tokens with spaces -> split -> ids ->
    # decode), which corrupted subword tokens containing internal markers.
    answer = tokenizer.decode(
        input_ids[0][start_index:end_index + 1], skip_special_tokens=True
    )
    return answer.strip()
import pdfplumber
import fitz # PyMuPDF
def extract_text_from_pdf(pdf_path):
    """Return the concatenated plain text of all pages of the PDF at *pdf_path*.

    Uses PyMuPDF (fitz). The document handle is closed even if extraction
    raises — the original version leaked the open document.
    """
    with fitz.open(pdf_path) as pdf:
        # Iterating the document yields its pages in order.
        return "".join(page.get_text() for page in pdf)
def chat_with_pdf(question, pdf):
    """Gradio handler: pull the text out of *pdf*, then answer *question* from it."""
    document_text = extract_text_from_pdf(pdf)
    answer = answer_question(question, document_text)
    return answer
# --- Gradio UI wiring ---
question_box = gr.Textbox(lines=2, placeholder="Enter your question here...")
pdf_upload = gr.File(type="filepath", file_types=[".pdf"], label="Upload PDF")
answer_box = gr.Textbox(label="Answer")

interface = gr.Interface(
    fn=chat_with_pdf,
    inputs=[question_box, pdf_upload],
    outputs=answer_box,
    title="Chat with PDF",
    description="Upload a PDF and ask questions about its content. Make sure the PDF is NOT too LARGE",
)

# Launch only when executed as a script (not when imported).
if __name__ == "__main__":
    interface.launch()