# doc_summaryLLM / app.py
# aps19's picture
# Update app.py
# 2135ce2
import streamlit as st
import fitz # PyMuPDF
from transformers import T5Tokenizer, T5ForConditionalGeneration
from transformers import pipeline
# Model and tokenizer loading
# Directory (relative to the working directory) that holds the
# LaMini-Flan-T5-248M files; both loaders below read from it.
checkpoint = "./model/LaMini-Flan-T5-248M"

# Loaded once at import time and shared by llm_pipeline(). The tuple is
# evaluated left to right, so the tokenizer is still loaded before the model.
tokenizer, base_model = (
    T5Tokenizer.from_pretrained(checkpoint),
    T5ForConditionalGeneration.from_pretrained(checkpoint),
)
# LLM pipeline
def llm_pipeline(pdf_contents, max_length=500, min_length=50):
    """Summarize the text of a PDF supplied as in-memory data.

    Args:
        pdf_contents: Raw PDF data, handed to PyMuPDF as a stream.
        max_length: Forwarded to the summarization pipeline as ``max_length``.
        min_length: Forwarded to the summarization pipeline as ``min_length``.

    Returns:
        The ``summary_text`` field of the first pipeline result.
    """
    # Open the document in a context manager so it is closed even when text
    # extraction raises; otherwise the handle is leaked on every call.
    with fitz.open(stream=pdf_contents, filetype="pdf") as pdf_document:
        # Pages are visited in document order and their text is concatenated
        # with no separator.
        pdf_text = "".join(page.get_text() for page in pdf_document)

    # Build the summarizer around the module-level model and tokenizer.
    pipe_sum = pipeline(
        'summarization',
        model=base_model,
        tokenizer=tokenizer,
        max_length=max_length,
        min_length=min_length,
    )
    result = pipe_sum(pdf_text)
    return result[0]['summary_text']
# Streamlit code
# Module-level page setup: request the "wide" layout. This runs before
# main() is called, since main() is only invoked at the end of the file.
st.set_page_config(layout="wide")
def main():
    """Render the upload form and display a summary of the chosen PDF.

    The length sliders and the "Summarize" button only appear once a file
    has been uploaded. Nothing is summarized until the button is pressed.
    """
    st.title("Document Summarization using a Language Model")

    # Button to upload a PDF file
    uploaded_file = st.file_uploader("Upload a PDF file", type=["pdf"])
    if uploaded_file is None:
        return

    max_length = st.slider("Maximum Summary Length", min_value=50, max_value=1000, step=20, value=500)
    min_length = st.slider("Minimum Summary Length", min_value=10, max_value=500, step=10, value=50)

    if not st.button("Summarize"):
        return

    # Check if the uploaded file is a PDF
    if uploaded_file.type != "application/pdf":
        st.error("Please upload a valid PDF file.")
        return

    # The two sliders are independent, so the minimum can be dragged above
    # the maximum. Report it and clamp rather than pass on an infeasible
    # pair of bounds.
    if min_length > max_length:
        st.warning("Minimum summary length exceeds the maximum; using the maximum for both.")
        min_length = max_length

    # getvalue() returns the whole buffer regardless of the stream position,
    # whereas read() returns nothing once the stream has been consumed.
    summary = llm_pipeline(uploaded_file.getvalue(), max_length, min_length)

    # Display the summary
    st.info("Summarization Complete")
    st.success(summary)


if __name__ == "__main__":
    main()