import gradio as gr from transformers import AutoModelForQuestionAnswering, AutoTokenizer, pipeline, AutoModelForSeq2SeqLM from pdfminer.high_level import extract_text from sklearn.feature_extraction.text import TfidfVectorizer from sklearn.metrics.pairwise import cosine_similarity import numpy as np import nltk from nltk.tokenize import sent_tokenize from rank_bm25 import BM25Okapi nltk.download('punkt') # QA model qa_model_name = "deepset/roberta-large-squad2" qa_model = AutoModelForQuestionAnswering.from_pretrained(qa_model_name) qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name) qa_pipeline = pipeline("question-answering", model=qa_model, tokenizer=qa_tokenizer) # Summarization model summarization_model_name = "facebook/bart-large-cnn" summarizer = pipeline("summarization", model=summarization_model_name) def read_pdf(file): try: text = extract_text(file) if not text: raise ValueError("PDF extraction failed.") return text except Exception as e: return str(e) def retrieve_relevant_text_bm25(question, sentences, top_n=3): try: tokenized_corpus = [sent.split() for sent in sentences] bm25 = BM25Okapi(tokenized_corpus) tokenized_query = question.split() doc_scores = bm25.get_scores(tokenized_query) top_n_indices = np.argsort(doc_scores)[::-1][:top_n] relevant_texts = [sentences[i] for i in top_n_indices] return " ".join(relevant_texts) except Exception as e: return str(e) def answer_question(pdf, question, num_words): try: text = read_pdf(pdf) if isinstance(text, str): return text sentences = sent_tokenize(text) relevant_text = retrieve_relevant_text_bm25(question, sentences) response = qa_pipeline(question=question, context=relevant_text) answer = response['answer'] answer = answer.strip() answer = " ".join(answer.split()) if len(answer.split()) > num_words: try: summarized_answer = summarizer(answer, max_length=num_words + 10, min_length=1) answer = summarized_answer[0]['summary_text'] answer = answer.strip() answer = " ".join(answer.split()) if len(answer.split()) > num_words: answer = " ".join(answer.split()[:num_words]) except RuntimeError as e: if "Input length of input_ids is" in str(e) and "but `max_length` is set to" in str(e): answer = " ".join(answer.split()[:num_words]) else: return f"Summarization Error: {e}" except Exception as e: return f"Summarization Error: {e}" elif relevant_text and len(answer.split()) < num_words: remaining_words = num_words - len(answer.split()) added_words = 0 added_sentences = for sentence in sent_tokenize(relevant_text): sentence_words = sentence.split() words_to_add = min(remaining_words - added_words, len(sentence_words)) if words_to_add > 0: added_sentences.append(" ".join(sentence_words[:words_to_add])) added_words += words_to_add if added_words >= remaining_words: break answer += " " + " ".join(added_sentences) answer = answer.strip() answer = " ".join(answer.split()) if len(answer.split()) > num_words: answer = " ".join(answer.split()[:num_words]) return answer.strip() except Exception as e: return str(e) with gr.Blocks() as iface: gr.Markdown("PDF Q&A with RoBERTa") with gr.Row(): with gr.Column(scale=2): question_input = gr.Textbox(lines=2, placeholder="Ask a question", label="Question") btn = gr.Button("Submit") with gr.Column(scale=1): pdf_input = gr.File(type="filepath", label="Upload PDF") num_words_slider = gr.Slider(minimum=1, maximum=500, value=100, step=1, label="Number of Words") answer_output = gr.Textbox(label="Answer", lines=5) btn.click(fn=answer_question, inputs=[pdf_input, question_input, num_words_slider], outputs=answer_output) if __name__ == "__main__": iface.launch()