"""Gradio app: answer questions about an uploaded PDF with T5.

A TF-IDF retriever picks the PDF lines most similar to the question,
and a text2text T5 pipeline generates the answer from that context.
"""

import gradio as gr
import numpy as np
from pdfminer.high_level import extract_text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from transformers.pipelines import pipeline

# Load T5 model and tokenizer once at startup (heavyweight download/load).
model_name = "t5-large"
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
qa_pipeline = pipeline("text2text-generation", model=model, tokenizer=tokenizer)


def read_pdf(file):
    """Extract the full text of a PDF (path or file-like object)."""
    return extract_text(file)


def retrieve_relevant_text(question, context, top_n=5):
    """Return the up-to-top_n context lines most similar to the question.

    Args:
        question: The user's question string.
        context: List of candidate text lines from the document.
        top_n: Maximum number of lines to select.

    Returns:
        The selected lines joined with spaces ("" if no usable context).
    """
    # Drop blank/whitespace-only lines: they carry no signal, and
    # TfidfVectorizer raises on an all-empty vocabulary.
    context = [line for line in context if line.strip()]
    if not context:
        return ""
    # Row 0 of the matrix is the question; rows 1..n are the context lines.
    vectors = TfidfVectorizer().fit_transform([question] + context).toarray()
    cosine_matrix = cosine_similarity(vectors)
    # Rank rows by similarity to the question (row 0); [1:] skips the
    # question's perfect self-similarity. Slicing also clamps top_n when
    # there are fewer context lines than requested.
    similar_ix = np.argsort(cosine_matrix[0])[::-1][1 : top_n + 1]
    # BUGFIX: matrix row ix maps to context[ix - 1] (row 0 is the question);
    # the original indexed context[ix], returning off-by-one lines and
    # raising IndexError when ix == len(context).
    relevant_texts = [context[ix - 1] for ix in similar_ix]
    return " ".join(relevant_texts)


def answer_question(pdf, question):
    """Answer a question about an uploaded PDF (Gradio callback)."""
    context = read_pdf(pdf).split("\n")
    relevant_text = retrieve_relevant_text(question, context)
    # T5 text2text question-answering prompt format.
    input_text = f"question: {question} context: {relevant_text}"
    response = qa_pipeline(input_text, max_length=512, do_sample=False)
    return response[0]['generated_text']


# Define Gradio interface.
# NOTE: gr.inputs / gr.outputs were deprecated in Gradio 3 and removed in
# Gradio 4 — use the top-level components. type="filepath" hands the
# callback a path string that pdfminer's extract_text accepts directly.
iface = gr.Interface(
    fn=answer_question,
    inputs=[
        gr.File(type="filepath", label="Upload PDF"),
        gr.Textbox(lines=2, placeholder="Ask a question"),
    ],
    outputs=gr.Textbox(label="Answer"),
    title="PDF Q&A with T5",
)

if __name__ == "__main__":
    iface.launch()