import PyPDF2
import pytesseract
import streamlit as st
from PIL import Image, ImageEnhance, ImageFilter
from transformers import pipeline
# Hugging Face checkpoint that backs the summarization pipeline.
checkpoint = "facebook/bart-large-cnn"
# Summarization pipeline, built when the script runs; summarize_text() calls it.
model = pipeline(task='summarization', model=checkpoint)

# Page title shown at the top of the Streamlit app.
st.title("Text Summarizer using LLM")
def extract_text_from_pdf(pdf_file):
    """Return the text of every page in *pdf_file*, joined by newlines.

    Args:
        pdf_file: A path or binary file-like object accepted by
            ``PyPDF2.PdfReader``.

    Returns:
        The concatenated text of all pages in reading order. A document
        with no pages yields an empty string.
    """
    pdf_reader = PyPDF2.PdfReader(pdf_file)
    # Collect every page instead of keeping only one page's text. The
    # ``or ""`` guards against a page that yields no text at all.
    page_texts = [page.extract_text() or "" for page in pdf_reader.pages]
    return "\n".join(page_texts)
def summarize_text(text):
    """Summarize *text* with the module-level summarization pipeline.

    Args:
        text: The document text to condense.

    Returns:
        The generated summary string, or an empty string when *text* is
        empty or contains only whitespace.
    """
    # Nothing to summarize: skip the model call rather than let it
    # generate filler to satisfy min_length.
    if not text or not text.strip():
        return ""
    # truncation=True clips inputs that exceed the model's maximum input
    # length, so an oversized document no longer crashes the pipeline.
    result = model(
        text,
        min_length=256,
        max_length=512,
        do_sample=True,
        truncation=True,
    )
    return result[0]['summary_text']
def perform_ocr(image):
    """Run Tesseract OCR on *image* and return the recognized text.

    The image is handed to Tesseract as-is; no enhancement or filtering
    is applied here.

    Args:
        image: The image object passed straight through to
            ``pytesseract.image_to_string``.

    Returns:
        The text string produced by Tesseract.
    """
    # English language data with page segmentation mode 3.
    ocr_options = {"lang": 'eng', "config": '--psm 3'}
    return pytesseract.image_to_string(image, **ocr_options)
def answering(tex):
    """Render a question box and answer the question using *tex* as context.

    Shows a text input and an "Answer Question" button. When the button is
    pressed with a non-empty question and context, the extracted answer is
    written to the page; otherwise a warning is shown.

    Args:
        tex: The context text the answer is extracted from.
    """
    question = st.text_input("Enter your question:")
    if not st.button("Answer Question"):
        return

    # The question-answering pipeline raises on an empty question or
    # context, so report the problem to the user instead of crashing.
    if not question.strip():
        st.warning("Please enter a question first.")
        return
    if not (tex or "").strip():
        st.warning("There is no text to answer questions from.")
        return

    # Build the pipeline only when an answer is actually requested, so the
    # model is not loaded on script runs where the button was not pressed.
    question_answerer = pipeline(
        "question-answering", model='distilbert-base-cased-distilled-squad'
    )
    result = question_answerer(question=question, context=tex)

    st.subheader("Answer:")
    st.write(result["answer"])
def _render_text_tools(text, heading):
    """Show *text* under *heading*, then offer summarization and Q&A on it."""
    st.subheader(heading)
    st.write(text)
    if st.button("Summarize"):
        summary = summarize_text(text)
        st.subheader("Summary:")
        st.write(summary)
    # The question box sits beside the Summarize button, not under it, so
    # it stays usable whether or not a summary was requested on this run.
    answering(text)


# Drop-down for selecting the input format.
input_format = st.selectbox("Select input format:", ('Text', 'PDF', 'Image'))

if input_format == 'Text':
    # Plain-text upload: decode the uploaded bytes as UTF-8.
    uploaded_file = st.file_uploader("Upload a text document (.txt)", type="txt")
    if uploaded_file is not None:
        text = uploaded_file.getvalue().decode("utf-8")
        _render_text_tools(text, "Original Text:")
elif input_format == 'PDF':
    # PDF upload: extract the text with PyPDF2.
    uploaded_file = st.file_uploader("Upload a PDF document", type="pdf")
    if uploaded_file is not None:
        text = extract_text_from_pdf(uploaded_file)
        _render_text_tools(text, "Original Text:")
elif input_format == 'Image':
    # Image upload: open it with Pillow, then extract the text via OCR.
    uploaded_image = st.file_uploader("Upload an image", type=["jpg", "jpeg", "png"])
    if uploaded_image is not None:
        image = Image.open(uploaded_image)
        text = perform_ocr(image)
        _render_text_tools(text, "Extracted Text from Image:")