Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

File size: 6,243 Bytes

9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
3d7adba
9c37e72
09d4214
 
 
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
b446f5c
a07988a
af0fae9
 
 
 
b446f5c
dd55b25
8c11fa3
 
f176b84
dd55b25
dec4937
6fe4e2e
dd55b25
 
10ef8bd
1a9fa41
c7a7627
 
 
 
 
3f06691
c7a7627
37287e0
c7a7627
0975d28
 
 
c7a7627
 
 
 
37287e0
0975d28
 
 
1a9fa41
0975d28
9f43588
c9a18bc
9c37e72
e113d20
f1ae271
 
0a7287e
5e3c2f2
ea6bf13
63c4e55
 
 
2bfe916
4f3134d
7b01ac0
 
 
d82d18a
ea8c799
82d1709
37287e0
6d5b302
5f35583
37287e0
d0a45f9
4f3134d
 
8cc1e8b
6e25163
dd55b25
a811191
5f35583
c7a7627
d0a45f9
5f35583
c7a7627
d0a45f9
6c1c515
ea6bf13
4f3134d
cd370f7
 
 
c7a7627
a811191
 
c7a7627
d0a45f9
a811191
c7a7627
d0a45f9
ea6bf13
4f3134d
cd370f7
0975d28
ce65dbf
0975d28
d0a45f9
b0e2f8f
 
 
 
7af9178
b0e2f8f
 
 
 
 
 
 
d0a45f9
9c37e72
b9b4937

"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application for summarizing
documents and text from Bangla images and English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints and the request headers used with them.
#
# SECURITY: access tokens should not live in source control. Each token is now
# read from an environment variable first (on a hosted deployment, configure
# these as secrets). The literal tokens are kept ONLY as a backward-compatible
# fallback so existing deployments keep working; they are already exposed by
# this file and should be revoked, after which the fallbacks can be deleted.
def _bearer_header(env_var, fallback_token):
    """Return an Authorization header using ``env_var`` if set, else ``fallback_token``."""
    token = os.environ.get(env_var) or fallback_token
    return {"Authorization": "Bearer " + token}


API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = _bearer_header("HF_API_TOKEN_XLSUM", "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")
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = _bearer_header("HF_API_TOKEN_T5", "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")
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = _bearer_header("HF_API_TOKEN_GPT2", "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")
	
def read_pdf(file):
    """Return the extracted text of every page of a PDF as one string.

    ``file`` is passed unchanged to ``PdfFileReader``. The result starts with
    a single space, and each page's extracted text is followed by one space.
    """
    reader = PdfFileReader(file)
    # Build every "<page text> " piece lazily, then join them in one pass.
    page_chunks = (
        reader.getPage(page_no).extractText() + " "
        for page_no in range(reader.numPages)
    )
    return " " + "".join(page_chunks)

# Seconds to wait for the hosted inference API, so a stalled request cannot
# hang the Streamlit script run indefinitely.
_HF_REQUEST_TIMEOUT = 60


def _summarize_via_api(api_url, headers, text, result_key):
    """Send ``text`` to a hosted summarization model and render the outcome.

    Args:
        api_url: Inference endpoint the request is POSTed to.
        headers: HTTP headers sent with the request.
        text: Text to summarize.
        result_key: Key holding the summary string in the first item of the
            response list (``"generated_text"`` or ``"summary_text"``).

    A summary is shown with ``st.success``. Empty input, network failures,
    non-JSON replies and API error payloads are reported in the UI instead of
    being dropped silently. Nothing is returned.
    """
    if not text or not text.strip():
        st.warning("There is no text to summarize.")
        return

    # NOTE(review): "min_length" is sent at the top level of the payload, as
    # before. The Inference API appears to expect generation options under a
    # "parameters" object, so this value may be ignored -- confirm before
    # moving it, since enforcing it would change the generated summaries.
    payload = {"inputs": text, "min_length": 300}
    try:
        response = requests.post(
            api_url, headers=headers, json=payload, timeout=_HF_REQUEST_TIMEOUT
        )
        out = response.json()
    except (requests.exceptions.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding errors raised by older `requests`.
        st.error(f"Summarization request failed: {exc}")
        return

    # Guard against an empty list before indexing into the response.
    first = out[0] if isinstance(out, list) and out else None
    if isinstance(first, dict) and first.get(result_key):
        st.success(first[result_key])
    elif isinstance(out, dict) and out.get("error"):
        st.error(f"Summarization failed: {out['error']}")
    else:
        st.warning("The summarization service returned no summary.")


def engsum(output):
    """Summarize English text ``output`` via ``API_URL1`` and display the result."""
    _summarize_via_api(API_URL1, headers1, output, "generated_text")


def bansum(text):
    """Summarize Bangla ``text`` via ``API_URL0`` and display the result."""
    _summarize_via_api(API_URL0, headers0, text, "summary_text")

# Page header: these module-level calls run before main() is invoked below.
st.title("Bangla and English Summarizer:")
st.subheader("Please, upload your PDF/Images or input texts to summarize: ")
#@st.cache_resource(experimental_allow_widgets=True)
# Approximate number of characters treated as one "line" by the PDF slider.
_CHARS_PER_LINE = 70


def _summarize_recognized(text, summarizer):
    """Pass OCR output to ``summarizer``, warning instead when OCR found nothing."""
    if not text or not text.strip():
        st.warning("No text could be recognized in the image.")
        return
    summarizer(text)


def _ocr_and_summarize(image_file, bangla_label, english_label, key_prefix):
    """OCR an uploaded or captured image and summarize it in the chosen language.

    Args:
        image_file: File-like object accepted by ``Image.open``.
        bangla_label: Label of the button that triggers Bangla OCR + summary.
        english_label: Label of the button that triggers English OCR + summary.
        key_prefix: Prefix for the button keys, so buttons that share a label
            in different sections of the page get distinct widget identities.
    """
    try:
        # Decode in memory: writing every upload to one shared "img.png" lets
        # concurrent sessions overwrite each other's image. RGB is also the
        # channel order pytesseract assumes for in-memory images.
        image = Image.open(image_file).convert("RGB")
    except OSError as exc:
        st.error(f"Could not read the image: {exc}")
        return

    st.text("Select the summarization type:")
    if st.button(bangla_label, key=f"{key_prefix}_bangla"):
        _summarize_recognized(pytesseract.image_to_string(image, lang="ben"), bansum)
    if st.button(english_label, key=f"{key_prefix}_english"):
        _summarize_recognized(pytesseract.image_to_string(image), engsum)


def _summarize_pdf(pdf_file):
    """Extract a PDF's text, let the user pick a line range, and summarize it."""
    tet = read_pdf(pdf_file)
    if not tet.strip():
        st.warning("No extractable text was found in this PDF.")
        return

    # The slider's upper bound must be passed explicitly: Streamlit's implicit
    # maximum is 100, which made large PDFs raise (default value above the
    # maximum) and made text past "line" 100 unreachable.
    total_lines = max(1, -(-len(tet) // _CHARS_PER_LINE))  # ceiling division
    # Same default selection as before (about the first tenth of the text),
    # but never an empty range.
    default_end = min(total_lines, max(1, len(tet) // (_CHARS_PER_LINE * 10)))
    first_line, last_line = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_lines,
        value=(0, default_end),
    )
    st.text("Selected text for summarize: ")
    text = tet[first_line * _CHARS_PER_LINE:last_line * _CHARS_PER_LINE]
    st.success(text)
    st.text("Summarized text: ")
    if text.strip():
        engsum(text)
    else:
        st.warning("The selected range contains no text; widen the selection.")


def main():
    """Render the summarizer page and dispatch on the provided input.

    Input can be typed text, an uploaded image/PDF, or a camera capture.
    PDFs are summarized as English text; for images and typed text the user
    picks Bangla or English with a button. Nothing is processed until a file
    widget has changed in this session or some text has been typed.
    """
    def change_photo_state():
        # Widget callback: remember that a file/camera input changed.
        st.session_state["photo"] = "done"

    message = st.text_input("Type your text here!")
    uploaded_photo = st.sidebar.file_uploader(
        "Upload your Images/PDF",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    camera_photo = st.sidebar.camera_input(
        "Capture a photo to summarize: ", on_change=change_photo_state
    )
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"
    if st.session_state["photo"] != "done" and not message:
        return

    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        _summarize_pdf(uploaded_photo)
    elif uploaded_photo:
        _ocr_and_summarize(uploaded_photo, "BENGALI", "ENGLISH", "upload")

    if camera_photo:
        _ocr_and_summarize(camera_photo, "Bangla", "English", "camera")

    if message:
        # Explicit keys: these labels are also used by the camera section, and
        # Streamlit rejects two identical buttons that have no distinct key.
        if st.button("Bangla", key="message_bangla"):
            bansum(message)
        if st.button("English", key="message_english"):
            engsum(message)
    # NOTE: an "English Text Generation" button using API_URL2/headers2 was
    # previously sketched here as commented-out code; it remains disabled.

if __name__ == '__main__':
    main()