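# Residue from the hosting site's file viewer (not part of the program):
#   avatar alt text: "Soumen's picture"
#   latest commit: "Update app.py" (2f22c01, verified)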
"""
# App: NLP App with Streamlit
Description:
This is a Natural Language Processing (NLP) based application that is useful for
document/text summarization from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
# Set ahead of the Hugging Face imports further down so the value is already
# in the environment when they load.
# NOTE(review): presumably this disables the tokenizers library's internal
# parallelism to avoid its fork-related warning — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
# Page configuration and title are issued here, before the remaining imports.
# NOTE(review): Streamlit requires set_page_config to be the first Streamlit
# command of a page, which is presumably why it sits this early — confirm
# before moving it.
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
from transformers import AutoTokenizer, AutoModelWithLMHead
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hosted inference endpoint used by bansum() for the multilingual mT5 model.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
# Bearer token is read from the "t5multilingual" environment variable; a
# missing variable raises KeyError at import time.
headers0 = {"Authorization": f"Bearer {os.environ['t5multilingual']}"}
# API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
# headers1 = {"Authorization": "Bearer "+str(os.environ["t5modeleng"])}
# API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
# headers2 = {"Authorization": "Bearer <REDACTED>"}  # a real token was hard-coded here; revoke it and load tokens from the environment instead
@st.cache
def read_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: PDF source handed directly to ``PdfFileReader``.

    Returns:
        A string that starts with a single space, followed by each page's
        extracted text with one trailing space after every page.
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_no).extractText()
        for page_no in range(reader.numPages)
    ]
    # One leading space, then "<page text> " for each page in order.
    return " " + "".join(page_text + " " for page_text in page_texts)
# def read_pdf_with_pdfplumber(file):
# # Open the uploaded PDF file with pdfplumber
# with pdfplumber.open(file) as pdf:
# extracted_text = ''
# for page in pdf.pages:
# extracted_text += page.extract_text()
# # Display the extracted text
# #st.text(extracted_text)
# return extracted_text
@st.cache(allow_output_mutation=True)
def _load_t5():
    """Load the ``t5-base`` tokenizer and model once and reuse them.

    Streamlit re-executes this script on every user interaction, so loading
    at plain module level re-instantiates the model on each rerun. Caching
    the loader keeps a single copy per server process.
    ``allow_output_mutation=True`` tells the cache not to hash the returned
    objects on every access.

    Returns:
        A ``(tokenizer, model)`` tuple for ``t5-base``.
    """
    t5_tokenizer = AutoTokenizer.from_pretrained('t5-base')
    t5_model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return t5_tokenizer, t5_model


# Module-level names are kept because engsum() reads them as globals.
tokenizer, model = _load_t5()
@st.cache
def _t5_summary(text):
    """Return the T5 summary of ``text`` as a plain string.

    Only this pure computation is cached; the caller does the display so the
    result is shown on every rerun, including cache hits.
    """
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=80,
        length_penalty=5.,
        num_beams=2,
    )
    # Let the tokenizer drop special tokens instead of slicing fixed widths
    # off both ends: when generation stops at max_length there is no end
    # token, and a fixed slice would cut real text.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def engsum(text):
    """Summarize English ``text`` with the module-level T5 model and show it.

    Args:
        text: The text to summarize.

    Returns:
        None. The summary is rendered with ``st.success``; empty or
        whitespace-only input produces a warning instead.
    """
    if not text or not text.strip():
        st.warning("No text to summarize.")
        return
    st.success(_t5_summary(text))
def bansum(text):
    """Summarize ``text`` through the hosted mT5 endpoint and show the result.

    Posts the text to ``API_URL0`` with ``headers0``. A successful response
    is a list whose first item carries ``"summary_text"``; that summary is
    rendered with ``st.success``. Network failures, non-JSON replies and
    error payloads are reported with ``st.error`` rather than being dropped.

    Args:
        text: The text to summarize.

    Returns:
        None.
    """
    # NOTE(review): the Inference API appears to expect generation options
    # under a "parameters" object, so this top-level "min_length" may be
    # ignored. Left unchanged; confirm against the API docs before moving it.
    payload = {"inputs": text, "min_length": 300}
    try:
        # A timeout keeps a stalled request from hanging the page forever.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        st.error(f"Summarization request failed: {err}")
        return

    # The extra checks guard against an empty list or non-dict first item.
    if isinstance(out, list) and out and isinstance(out[0], dict) and out[0].get("summary_text"):
        st.success(out[0]["summary_text"])
        return

    # Error replies arrive as a dict with an "error" message when present.
    detail = out.get("error") if isinstance(out, dict) else None
    st.error(detail or "The summarization service returned no summary.")
# @st.cache
# def save(l):
# return l
#@st.cache
# Rough characters-per-line figure used to turn the "lines" slider into
# character offsets within the extracted PDF text.
_CHARS_PER_LINE = 100


def _pick_pdf_excerpt(full_text):
    """Show a line-range slider and return the matching slice of ``full_text``."""
    # Ceiling division so the final partial line is selectable; at least 1 so
    # the slider always has a non-empty range.
    total_lines = max(1, -(-len(full_text) // _CHARS_PER_LINE))
    first_line, last_line = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_lines,
        value=(0, total_lines),
    )
    return full_text[first_line * _CHARS_PER_LINE:last_line * _CHARS_PER_LINE]


def _ocr_and_summarize(image_file, bangla_label, english_label):
    """OCR an uploaded or captured image and summarize it in the chosen language."""
    # Decode in memory: writing to a fixed "img.png" in the working directory
    # lets concurrent sessions overwrite each other's image.
    image = Image.open(image_file).convert("RGB")
    st.subheader("Select the summarization type:")
    bangla_col, english_col = st.columns([1, 7])
    if bangla_col.button(bangla_label):
        text = pytesseract.image_to_string(image, lang="ben")
        st.success(text)
        st.subheader("সারাংশ/সারমর্ম")
        bansum(text)
    if english_col.button(english_label):
        text = pytesseract.image_to_string(image)
        st.success(text)
        st.subheader("Summarized Text")
        engsum(text)


def main():
    """Render the summarizer page and route the input to a summarizer.

    Three inputs are offered side by side: a text box, a file uploader
    (images or PDF) and an optional camera capture. Once any input is
    present, the first match in this order is handled:

    1. uploaded PDF: text is extracted, a line range is chosen with a
       slider, and the excerpt is summarized in English;
    2. uploaded image: OCR, then Bangla or English summarization;
    3. camera photo: OCR, then Bangla or English summarization;
    4. otherwise the typed text is summarized.
    """
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    # A button is True only during the rerun caused by its click, so the
    # camera's on/off state is kept in session state. Otherwise the camera
    # widget disappears on the very rerun that delivers the photo.
    if "camera_on" not in st.session_state:
        st.session_state["camera_on"] = False

    def change_photo_state():
        st.session_state["photo"] = "done"

    camera_photo = None
    with st.container():
        c1, c2, c3 = st.columns([1.5, 1.5, 1.5])
        message = c1.text_input("Type your text here!")
        if c3.button("Start Camera"):
            st.session_state["camera_on"] = True
        if c3.button("Stop Camera"):
            st.session_state["camera_on"] = False
        if st.session_state["camera_on"]:
            camera_photo = c3.camera_input(
                "Capture a photo to summarize: ", on_change=change_photo_state
            )
        uploaded_photo = c2.file_uploader(
            "Upload your Images/PDF",
            type=['jpg', 'png', 'jpeg', 'pdf'],
            on_change=change_photo_state,
        )

    # Nothing to do until a file/photo has been provided or text was typed.
    if not (st.session_state["photo"] == "done" or message):
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        pdf_text = read_pdf(uploaded_photo)
        if not pdf_text.strip():
            st.warning("No extractable text was found in this PDF.")
            return
        text = _pick_pdf_excerpt(pdf_text)
        if st.button("English Pdf Summarize"):
            st.subheader("Selected text for summarize: ")
            st.success(text)
            st.subheader("Summarized Text: ")
            engsum(text)
    elif uploaded_photo:
        _ocr_and_summarize(uploaded_photo, "BENGALI", "ENGLISH")
    elif camera_photo:
        _ocr_and_summarize(camera_photo, "Bangla", "English")
    else:
        bangla_col, english_col = st.columns([1, 7])
        want_bangla = bangla_col.button("Bangla")
        want_english = english_col.button("English")
        if (want_bangla or want_english) and not message.strip():
            # Reached when a file was uploaded and later removed, leaving
            # only an empty text box.
            st.warning("Please type some text to summarize.")
        elif want_bangla:
            bansum(message)
        elif want_english:
            engsum(message)


if __name__ == "__main__":
    main()