Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Sleeping

File size: 6,591 Bytes

9c37e72
dba2773
9c37e72
68f40bc
d82d18a
9c37e72
 
6e58c44
4834995
bd18577
 
 
 
 
 
 
2f51bd6
c75cc74
3d7adba
9c37e72
09d4214
 
 
06dd768
09d4214
0842639
baf370a
9c37e72
 
 
 
1ecea99
fa73ddc
419e04c
9c37e72
a07988a
 
b446f5c
a07988a
b446f5c
 
dd55b25
8c11fa3
 
f176b84
dd55b25
dec4937
6fe4e2e
dd55b25
 
 
 
 
 
8c11fa3
dd55b25
10ef8bd
d82d18a
 
 
 
 
 
1a9fa41
d82d18a
 
 
1a9fa41
 
c6ec27d
c9a18bc
9c37e72
a07988a
 
 
e113d20
3f6c2be
9c37e72
1a16a58
3d7adba
f1ae271
 
 
3d7adba
0a7287e
2c22d61
6b10636
63c4e55
 
 
cb6a8b6
2bfe916
7b01ac0
 
 
d82d18a
ea8c799
82d1709
6d5b302
d82d18a
e8b7e7a
8cc1e8b
6e25163
dd55b25
3e4f1f9
dd55b25
ed0375d
dd55b25
3e4f1f9
8c11fa3
f5aabdb
 
3e4f1f9
dd55b25
3e4f1f9
886ee3f
7c894b1
6c1c515
1e17e2b
cd370f7
 
 
7c894b1
6c1c515
1e17e2b
cd370f7
 
 
3d7adba
7c894b1
7af9178
 
 
 
 
 
 
 
7c894b1
a07988a
 
 
 
 
 
9c37e72
b9b4937

"""
#App: NLP App with Streamlit
Description
This is a Natural Language Processing (NLP) based application for summarizing
documents/text extracted from Bangla images and from English images/PDF files.
"""
# Core Pkgs
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
#os.system('conda install -c conda-forge poppler')
import streamlit as st
st.set_page_config(page_title="Summarization Tool", layout="wide", initial_sidebar_state="expanded")
import torch
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
#from line_cor import mark_region
import pdf2image
import requests
import cv2
import numpy as np
import pytesseract
import line_cor
import altair as alt
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
# Hugging Face Inference API endpoints and the auth headers sent with each request.
# SECURITY NOTE(review): the bearer tokens below are committed in source and should
# be treated as leaked -- rotate them and supply the replacement through the
# HF_API_TOKEN environment variable. The literals remain only as a fallback so that
# existing deployments behave exactly as before when the variable is unset or empty.
_HF_API_TOKEN = os.environ.get("HF_API_TOKEN")
# Text-generation model (gpt2-large).
API_URL = "https://api-inference.huggingface.co/models/gpt2-large"
headers = {"Authorization": "Bearer " + (_HF_API_TOKEN or "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")}
# Summarization model (csebuetnlp/mT5_multilingual_XLSum).
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {"Authorization": "Bearer " + (_HF_API_TOKEN or "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")}

	
def read_pdf(file):
    """Return the text of every page of a PDF as one string.

    Args:
        file: Whatever ``PdfFileReader`` accepts as its source; the caller in
            this file passes the object returned by ``st.file_uploader``.

    Returns:
        A string that starts with a single space, followed by each page's
        extracted text in page order, every page followed by one space.
    """
    reader = PdfFileReader(file)
    page_chunks = [
        reader.getPage(page_index).extractText() + " "
        for page_index in range(reader.numPages)
    ]
    # The leading space matches the seed value the accumulated string has
    # always started from.
    return " " + "".join(page_chunks)
# def read_pdf_with_pdfplumber(file):
#     # Open the uploaded PDF file with pdfplumber
#     with pdfplumber.open(file) as pdf:
#         extracted_text = ''
#         for page in pdf.pages:
#             extracted_text += page.extract_text()

#     # Display the extracted text
#     #st.text(extracted_text)
#     return extracted_text


# Page heading; this runs at module level, before main() builds the rest of the UI.
st.title("NLP APPLICATION")
#@st.cache_resource(experimental_allow_widgets=True)
# Markdown blurb rendered at the top of the page. Written with escapes so the
# exact whitespace (including the tab characters) of the displayed text is kept.
_DESCRIPTION_MD = (
    "\n    \t#### Description\n"
    "    \t##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:\n"
    "        PDF Document/Image's Text Summarization.\n"
    "    \t"
)

# Number of characters the PDF slider treats as one "line" of text.
_CHARS_PER_LINE = 7 * 10
# Upper bound st.slider applies to integer sliders when max_value is not given.
_SLIDER_IMPLICIT_MAX = 100
# Give up on an Inference API call after this many seconds instead of hanging.
_API_TIMEOUT_SECONDS = 120


def _ocr_image(image_file, use_bangla):
    """Run Tesseract OCR on an uploaded or captured image and return the text.

    Args:
        image_file: File-like object that ``PIL.Image.open`` can read.
        use_bangla: When true, OCR runs with ``lang="ben"``; otherwise
            pytesseract's default language is used.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # Round-trip through a PNG on disk so pytesseract receives the same
    # cv2-decoded array it has always been given.
    Image.open(image_file).save("img.png")
    pixels = cv2.imread("img.png")
    if use_bangla:
        return pytesseract.image_to_string(pixels, lang="ben")
    return pytesseract.image_to_string(pixels)


def _query_inference_api(url, auth_headers, payload):
    """POST ``payload`` as JSON to ``url`` and return the decoded JSON reply.

    Network failures, timeouts and non-JSON replies are returned as
    ``{"error": "<message>"}`` rather than raised, so the caller can display
    them instead of the app crashing with a traceback.
    """
    try:
        response = requests.post(
            url, headers=auth_headers, json=payload, timeout=_API_TIMEOUT_SECONDS
        )
        return response.json()
    except (requests.RequestException, ValueError) as exc:
        # ValueError covers JSON decoding failures on older requests versions.
        return {"error": str(exc)}


def _show_api_result(result):
    """Display an API reply: as an error if it has an "error" key, else as success."""
    if isinstance(result, dict) and "error" in result:
        st.error(result)
    else:
        st.success(result)


def main():
    """Build the Streamlit page: collect text, then offer generation/summarization.

    Text comes from one of four sources, checked in this order: an uploaded
    PDF (text extracted with ``read_pdf`` and sliced by a range slider), an
    uploaded image (OCR), a camera photo (OCR), or the free-text input box.
    Once text is available, two checkboxes let the user send it to the
    text-generation endpoint (``API_URL``) and/or the summarization endpoint
    (``API_URL0``) and show the JSON replies.
    """
    st.markdown(_DESCRIPTION_MD)

    def change_photo_state():
        # Widget callback: remember that a file or photo has been supplied.
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your pdf/images/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    uploaded_photo = st.file_uploader("Upload your PDF", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
    camera_photo = st.camera_input("Take a photo, Containing English texts", on_change=change_photo_state)
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Nothing to process until a file/photo was supplied or some text was typed.
    if not (st.session_state["photo"] == "done" or message):
        return

    text = " "
    if uploaded_photo and uploaded_photo.type == 'application/pdf':
        pdf_text = read_pdf(uploaded_photo)
        total_lines = len(pdf_text) // _CHARS_PER_LINE
        # Default selection ends at one tenth of the document. It is capped
        # because the slider has no explicit max_value, and a default above
        # the implicit maximum makes st.slider raise on large PDFs.
        default_end = min(len(pdf_text) // (7 * 100), _SLIDER_IMPLICIT_MAX)
        values = st.slider('Select a approximate number of lines to see and summarize', value=[0, default_end])
        if values[0] != total_lines:
            text = pdf_text[values[0] * _CHARS_PER_LINE:values[1] * _CHARS_PER_LINE]
        else:
            # NOTE(review): when the range starts at the last line, everything
            # after the first tenth of a line-count offset is shown; kept as-is,
            # confirm this is the intended behaviour.
            text = pdf_text[len(pdf_text) // (7 * 100):]
        st.success(text)
    elif uploaded_photo:
        text = _ocr_image(uploaded_photo, st.checkbox("Bangla"))
    elif camera_photo:
        # Checked means Bangla, exactly as for uploaded images. The two OCR
        # calls used to be swapped here, so English photos were read as Bangla.
        text = _ocr_image(camera_photo, st.checkbox("Bangla"))
    else:
        # No file and no photo: fall back to the typed text.
        text = message

    if st.checkbox("English Text Generation"):
        _show_api_result(_query_inference_api(API_URL, headers, {"inputs": text}))
    if st.checkbox("Mark for ENG Text Summarization, ENSUER MARKING ABOVE BANGLA FOR BANGLA!"):
        _show_api_result(_query_inference_api(API_URL0, headers0, {"inputs": text}))
# Build the UI only when this file runs as the main module, not when it is imported.
if __name__ == '__main__':
    main()