Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

File size: 7,599 Bytes

9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
ba45265
73bf640
9c37e72
09d4214
 
29e33a8
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
5496661
419e04c
9c37e72
61e5b98
b446f5c
61e5b98
 
 
 
 
 
 
ba45265
29e33a8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d5dc1c
29e33a8
 
 
bc02731
 
3d283a2
 
5496661
 
 
 
 
 
382b684
c7a7627
 
 
 
37287e0
0975d28
 
 
b330918
c9df9a0
 
 
ec4347b
9d1426d
b330918
ebcff05
b330918
 
 
 
 
2f22c01
b330918
8eacb9c
61e5b98
 
 
2d1cbd4
cbcd024
61e5b98
b330918
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d960c1e
b330918
 
 
 
 
bf72d34
2e3a776
b330918
 
ac35f7a
b330918
 
 
 
ac35f7a
b330918
 
 
 
 
 
 
 
bf72d34
ac35f7a
2e3a776
b330918
 
ac35f7a
b330918
 
 
 
ac35f7a
b330918
 
 
 
 
2e3a776
b330918
 
 
 
d960c1e
9d1426d

"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application for summarizing
documents/text extracted from Bangla images and from English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
st.title("Bangla/English Text Summarizer: Upload Images/Pdf or input texts to summarize!")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
from transformers import AutoTokenizer, AutoModelWithLMHead
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image

# Hugging Face Inference API endpoint of the mT5 multilingual XLSum model;
# bansum() posts its summarization requests to this URL.
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
# Bearer token is read from the "t5multilingual" environment variable; if the
# variable is not set, this line raises KeyError as soon as the script runs.
headers0 = {"Authorization": "Bearer "+str(os.environ["t5multilingual"])}

# API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
# headers1 = {"Authorization": "Bearer "+str(os.environ["t5modeleng"])}

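# API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
# headers2 = {"Authorization": "Bearer <HF_API_TOKEN>"}  # token redacted: never commit credentials; read them from os.environ as headers0 does, and revoke the token that was previously committed here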
@st.cache
def read_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Path or binary file-like object accepted by ``PdfFileReader``.

    Returns:
        A string that starts with a single space and contains each page's
        extracted text followed by one space, in page order.
    """
    reader = PdfFileReader(file)
    # Pages are pulled lazily, in order, so extraction happens page by page.
    page_texts = (
        reader.getPage(page_no).extractText()
        for page_no in range(reader.numPages)
    )
    return " " + "".join(chunk + " " for chunk in page_texts)
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text
# Load the pretrained 't5-base' tokenizer and model once at module level;
# engsum() uses both for English summarization.
# NOTE(review): AutoModelWithLMHead is reported as deprecated in recent
# transformers releases (AutoModelForSeq2SeqLM is the suggested replacement
# for T5) -- confirm against the pinned transformers version.
tokenizer = AutoTokenizer.from_pretrained('t5-base')
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
@st.cache(suppress_st_warning=True)
def _summarize_english(text):
    """Return the T5 summary of ``text`` as a plain string (cached per input).

    Only the pure computation is cached; nothing is drawn here, so a cache
    hit cannot swallow UI output.
    """
    inputs = tokenizer.encode(
        "summarize: " + text,
        return_tensors='pt',
        max_length=512,
        truncation=True,
    )
    summary_ids = model.generate(
        inputs,
        max_length=150,
        min_length=80,
        length_penalty=5.,
        num_beams=2,
    )
    # Let the tokenizer drop special tokens instead of slicing fixed offsets:
    # when generation stops at max_length there is no trailing end-of-sequence
    # marker, and a hard-coded [5:-4] slice would cut real characters.
    return tokenizer.decode(summary_ids[0], skip_special_tokens=True)


def engsum(text):
    """Summarize English ``text`` with the local T5 model and display it.

    Args:
        text: The English text to summarize. Input longer than 512 tokens is
            truncated by the tokenizer before generation.

    Returns:
        None. The summary is rendered with ``st.success``.
    """
    # Rendering happens outside the cached helper so the summary is shown on
    # every call, including repeated calls with the same text.
    st.success(_summarize_english(text))
def bansum(text):
    """Summarize ``text`` through the hosted mT5 XLSum endpoint and display it.

    Posts the text to ``API_URL0`` and renders the returned ``summary_text``
    with ``st.success``. Network failures, non-JSON replies, API error
    payloads and empty results are reported to the user instead of being
    silently ignored.

    Args:
        text: The text to summarize.

    Returns:
        None. Output is written to the Streamlit page.
    """
    def query(payload):
        # A timeout keeps the app from hanging forever on a stalled request.
        response = requests.post(API_URL0, headers=headers0, json=payload, timeout=60)
        return response.json()

    # NOTE(review): the Inference API documents generation options under a
    # "parameters" key; a top-level "min_length" may be ignored -- confirm
    # before relying on it. The payload is kept exactly as it was.
    try:
        out = query({"inputs": text, "min_length":300})
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {exc}")
        return

    if isinstance(out, list) and out and isinstance(out[0], dict) and out[0].get("summary_text"):
        st.success(out[0]["summary_text"])
    elif isinstance(out, dict) and out.get("error"):
        # The API answers with {"error": ...} e.g. while the model is loading.
        st.warning(f"Summarization service returned an error: {out['error']}")
    else:
        st.warning("The summarization service returned no summary.")
    
# @st.cache
# def save(l):
#     return l
#@st.cache
def _ocr_and_summarize(image_file, bangla_label, english_label):
    """OCR an uploaded or captured image and summarize the recognized text.

    Args:
        image_file: File-like object holding the image (upload or camera).
        bangla_label: Label of the button that triggers Bangla OCR + summary.
        english_label: Label of the button that triggers English OCR + summary.
    """
    # Decode in memory rather than via a shared "img.png" on disk, which
    # concurrent sessions would overwrite. RGB is the channel order
    # pytesseract expects.
    img = Image.open(image_file).convert("RGB")
    st.subheader("Select the summarization type:")
    bangla_col, english_col = st.columns([1, 7])
    if bangla_col.button(bangla_label):
        text = pytesseract.image_to_string(img, lang="ben")
        st.success(text)
        st.subheader("সারাংশ/সারমর্ম")
        bansum(text)
    if english_col.button(english_label):
        text = pytesseract.image_to_string(img)
        st.success(text)
        st.subheader("Summarized Text")
        engsum(text)


def main():
    """Render the summarizer page and dispatch on the available input.

    Three inputs are offered side by side: a text box, an image/PDF uploader
    and a camera. Once any of them has been used, the first available source
    is processed in this order: uploaded PDF, uploaded image, camera photo,
    typed text.
    """
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    if "camera_on" not in st.session_state:
        st.session_state["camera_on"] = False

    def change_photo_state():
        st.session_state["photo"] = "done"

    def start_camera():
        st.session_state["camera_on"] = True

    def stop_camera():
        st.session_state["camera_on"] = False

    camera_photo = None
    with st.container():
        c1, c2, c3 = st.columns([1.5, 1.5, 1.5])
        message = c1.text_input("Type your text here!")

        # A button is only True on the single rerun right after the click, so
        # the camera on/off state is kept in session_state. Otherwise the
        # camera widget would vanish on the rerun triggered by taking a photo.
        c3.button("Start Camera", on_click=start_camera)
        if st.session_state["camera_on"]:
            camera_photo = c3.camera_input("Capture a photo to summarize: ", on_change=change_photo_state)
        c3.button("Stop Camera", on_click=stop_camera)

        uploaded_photo = c2.file_uploader("Upload your Images/PDF",type=['jpg','png','jpeg','pdf'], on_change=change_photo_state)

        if not (st.session_state["photo"] == "done" or message):
            return

        if uploaded_photo and uploaded_photo.type == 'application/pdf':
            tet = read_pdf(uploaded_photo)
            values = st.slider('Select a approximate number of lines to see and summarize',value=[0, len(tet)//(7*100)])
            # NOTE(review): the start offset uses 70 characters per slider
            # unit and the end offset 1000; the arithmetic is kept unchanged
            # because the intended unit is not clear from this file.
            if values[0] != len(tet)//(10*100):
                text = tet[values[0]*7*10:values[1]*10*100]
            else:
                text = tet[len(tet)//(10*100):]
            if st.button("English Pdf Summarize"):
                st.subheader("Selected text for summarize: ")
                st.success(text)
                st.subheader("Summarized Text: ")
                engsum(text)
        elif uploaded_photo:
            _ocr_and_summarize(uploaded_photo, "BENGALI", "ENGLISH")
        elif camera_photo:
            _ocr_and_summarize(camera_photo, "Bangla", "English")
        else:
            text = message
            c8, c9 = st.columns([1, 7])
            bangla_clicked = c8.button("Bangla")
            english_clicked = c9.button("English")
            if not text.strip():
                # Reached when the "done" flag is set but no input remains
                # (e.g. the upload was removed); do not summarize empty text.
                if bangla_clicked or english_clicked:
                    st.warning("Please type some text to summarize.")
                return
            if bangla_clicked:
                bansum(text)
            if english_clicked:
                engsum(text)
            
if __name__ == "__main__":
    main()