"""Streamlit app that summarizes an uploaded PDF with a LaMini-Flan-T5 model.

The page lets the user upload a PDF, extracts its text with PyMuPDF, and
runs the text through a Hugging Face ``summarization`` pipeline.
"""

import base64

import fitz  # PyMuPDF
import streamlit as st
import torch
from huggingface_hub import login
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from transformers import T5ForConditionalGeneration, T5Tokenizer, pipeline

# Model and tokenizer loading.
# Alternative checkpoint: "google/flan-t5-base".
checkpoint = "MBZUAI/LaMini-Flan-T5-248M"
tokenizer = T5Tokenizer.from_pretrained(checkpoint)
base_model = T5ForConditionalGeneration.from_pretrained(
    checkpoint,
    device_map='auto',
    torch_dtype=torch.float32,
)


def _extract_pdf_text(pdf_contents: bytes) -> str:
    """Return the concatenated text of every page of an in-memory PDF."""
    # The context manager closes the document even if a page fails to
    # load, so the underlying PyMuPDF resources are not leaked.
    with fitz.open(stream=pdf_contents, filetype="pdf") as pdf_document:
        return "".join(
            pdf_document.load_page(page_num).get_text()
            for page_num in range(pdf_document.page_count)
        )


# LLM pipeline
def llm_pipeline(pdf_contents: bytes) -> str:
    """Summarize the text contained in a PDF.

    Args:
        pdf_contents: Raw bytes of the PDF file.

    Returns:
        The ``summary_text`` of the first result produced by the
        summarization pipeline.

    Raises:
        ValueError: If no text could be extracted from the PDF (for
            example a scanned document without a text layer).
    """
    pdf_text = _extract_pdf_text(pdf_contents)
    if not pdf_text.strip():
        # Feeding an empty prompt to the model yields a meaningless
        # "summary"; fail loudly so the UI can tell the user why.
        raise ValueError("No extractable text was found in the uploaded PDF.")

    # NOTE(review): the whole document is passed to the model in one go;
    # very long PDFs may exceed the model's intended input length.
    # Consider chunking or truncation if that becomes a problem.
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=500,
        min_length=50,
    )
    result = pipe_sum(pdf_text)
    return result[0]['summary_text']


# Streamlit code
st.set_page_config(layout="wide")


def main() -> None:
    """Render the upload form and show the summary when requested."""
    st.title("Document Summarization App using Language Model")
    uploaded_file = st.file_uploader("Upload your PDF file", type=['pdf'])

    if uploaded_file is None:
        return
    if not st.button("Summarize"):
        return

    try:
        summary = llm_pipeline(uploaded_file.read())
    except ValueError as exc:
        # Raised by llm_pipeline when the PDF has no extractable text.
        st.error(str(exc))
        return

    # Display the summary
    st.info("Summarization Complete")
    st.success(summary)


if __name__ == "__main__":
    main()