|
""" |
|
#App: NLP App with Streamlit |
|
Description |
|
This is a Natural Language Processing(NLP) base Application that is useful for |
|
Document/Text Summarization from Bangla images and English Images/PDF files. |
|
""" |
|
|
|
import os |
|
# Set before the heavier imports below run, so any library that reads this
# variable at import time sees it.  Presumably meant to keep Hugging Face
# tokenizers from spawning parallel workers -- confirm it is still needed.
os.environ["TOKENIZERS_PARALLELISM"] = "false"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
# Page-wide settings: browser tab title, wide layout, sidebar open on load.
# Kept directly after the streamlit import so it runs ahead of every other
# Streamlit call in this file.
st.set_page_config(
    page_title="Summarization Tool",
    layout="wide",
    initial_sidebar_state="expanded",
)
|
import torch |
|
import docx2txt |
|
from PIL import Image |
|
from PyPDF2 import PdfFileReader |
|
from pdf2image import convert_from_bytes |
|
import pdfplumber |
|
|
|
import pdf2image |
|
import requests |
|
import cv2 |
|
import numpy as np |
|
import pytesseract |
|
import line_cor |
|
import altair as alt |
|
|
|
from PIL import Image |
|
# Hugging Face Inference API endpoints and the authorization headers sent with
# each request.
#
# SECURITY NOTE(review): the fallback tokens below are committed to source
# control and must be treated as leaked -- revoke them and provide fresh ones
# through the environment variables read here.  The literals are kept only so
# that existing deployments keep working until those variables are set.

# Multilingual summarization model (used for Bangla text).
API_URL0 = "https://api-inference.huggingface.co/models/csebuetnlp/mT5_multilingual_XLSum"
headers0 = {
    "Authorization": "Bearer "
    + os.environ.get("HF_TOKEN_XLSUM", "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")
}

# English summarization model.
API_URL1 = "https://api-inference.huggingface.co/models/Michael-Vptn/text-summarization-t5-base"
headers1 = {
    "Authorization": "Bearer "
    + os.environ.get("HF_TOKEN_T5_SUMMARY", "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")
}

# English text-generation model.
API_URL2 = "https://api-inference.huggingface.co/models/gpt2"
headers2 = {
    "Authorization": "Bearer "
    + os.environ.get("HF_TOKEN_GPT2", "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")
}
|
|
|
def read_pdf(file) -> str:
    """Extract the text of every page of a PDF.

    Args:
        file: A path or binary file-like object accepted by the PyPDF2 reader
            (the caller in this file passes the uploaded PDF).

    Returns:
        A string that starts with a single space, followed by each page's
        text with one space appended after every page.
    """
    try:
        # Current PyPDF2 API.  The legacy PdfFileReader names used previously
        # are removed in PyPDF2 3.x, so prefer this whenever it is available.
        from PyPDF2 import PdfReader
    except ImportError:
        # Older PyPDF2 releases only ship the legacy reader API.
        reader = PdfFileReader(file)
        page_texts = (
            reader.getPage(index).extractText()
            for index in range(reader.numPages)
        )
    else:
        reader = PdfReader(file)
        # Guard against a page yielding no text object at all.
        page_texts = (page.extract_text() or "" for page in reader.pages)

    # Same layout as before (" " + page + " " + page + " " ...), built with a
    # single join instead of repeated string concatenation.
    return " " + "".join(page_text + " " for page_text in page_texts)
|
|
|
def engsum(output: str) -> None:
    """Summarize English text through the endpoint at API_URL1 and display it.

    The summary is shown with ``st.success``.  A failed request is reported
    with ``st.error`` and an answer without a summary with ``st.warning``;
    nothing is returned.

    Args:
        output: The text to summarize.
    """

    def query(payload):
        # A timeout keeps the Streamlit run from hanging forever on a stalled
        # connection.
        response = requests.post(
            API_URL1, headers=headers1, json=payload, timeout=60
        )
        return response.json()

    try:
        out = query({"inputs": output})
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return

    # The answer is expected to be a non-empty list of dicts; anything else
    # (for example an {"error": ...} dict) is treated as "no summary".
    first = out[0] if isinstance(out, list) and out else None
    summary = None
    if isinstance(first, dict):
        # Text2text endpoints answer with "generated_text"; summarization
        # endpoints use "summary_text".  Accept either.
        summary = first.get("generated_text") or first.get("summary_text")

    if summary:
        st.success(summary)
    else:
        reason = out.get("error") if isinstance(out, dict) else None
        st.warning(reason or "The summarization service returned no summary.")
|
def bansum(text: str) -> None:
    """Summarize Bangla text through the endpoint at API_URL0 and display it.

    The summary is shown with ``st.success``.  A failed request is reported
    with ``st.error`` and an answer without a summary with ``st.warning``;
    nothing is returned.

    Args:
        text: The text to summarize.
    """

    def query(payload):
        # A timeout keeps the Streamlit run from hanging forever on a stalled
        # connection.
        response = requests.post(
            API_URL0, headers=headers0, json=payload, timeout=60
        )
        return response.json()

    try:
        out = query({"inputs": text})
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Summarization request failed: {err}")
        return

    # The answer is expected to be a non-empty list of dicts; anything else
    # (for example an {"error": ...} dict) is treated as "no summary".
    first = out[0] if isinstance(out, list) and out else None
    summary = None
    if isinstance(first, dict):
        # Text2text endpoints answer with "generated_text"; summarization
        # endpoints use "summary_text".  Accept either.
        summary = first.get("generated_text") or first.get("summary_text")

    if summary:
        st.success(summary)
    else:
        reason = out.get("error") if isinstance(out, dict) else None
        st.warning(reason or "The summarization service returned no summary.")
|
|
|
# Page heading.  This is a module-level call, so it runs as soon as the file
# is executed, before anything main() draws.
st.title("NLP APPLICATION")
|
|
|
# st.session_state slot that remembers the most recent OCR outcome.
_OCR_STATE_KEY = "ocr_result"
# Rough number of characters treated as one "line" by the PDF range slider.
_CHARS_PER_LINE = 7 * 10


def _forget_ocr_result():
    """Drop the remembered OCR outcome (callback of the "Refresh" button)."""
    if _OCR_STATE_KEY in st.session_state:
        del st.session_state[_OCR_STATE_KEY]


def _ocr_image(image_file, language=None):
    """Return the text Tesseract recognises in an uploaded or captured image.

    The image is decoded in memory instead of being written to a shared
    ``img.png`` in the working directory, so concurrent sessions cannot
    overwrite each other's picture.
    """
    image_file.seek(0)  # the same buffer may already have been read
    # convert("RGB") yields a plain 3-channel image with no source format set.
    picture = Image.open(image_file).convert("RGB")
    if language is None:
        return pytesseract.image_to_string(picture)
    return pytesseract.image_to_string(picture, lang=language)


def _ocr_with_language_buttons(image_file, prompt):
    """Show the language buttons and return ``(text, is_bangla)``.

    ``st.button`` is true only during the rerun triggered by the click, so
    the outcome is stored in ``st.session_state`` and reused on later reruns
    (for example when a checkbox is ticked afterwards).
    """
    st.text(prompt)

    # Identify the picture so a stored result is discarded for a new image.
    source_id = (
        getattr(image_file, "name", None),
        getattr(image_file, "size", None),
    )
    remembered = None
    if _OCR_STATE_KEY in st.session_state:
        remembered = st.session_state[_OCR_STATE_KEY]
    if remembered is None or remembered["source"] != source_id:
        remembered = {"source": source_id, "text": " ", "bangla": False}

    if st.button("Content Type: Bangla"):
        remembered = {
            "source": source_id,
            "text": _ocr_image(image_file, "ben"),
            "bangla": True,
        }
    if st.button("Content Type: English"):
        remembered = {
            "source": source_id,
            "text": _ocr_image(image_file),
            "bangla": False,
        }

    st.session_state[_OCR_STATE_KEY] = remembered
    return remembered["text"], remembered["bangla"]


def _select_pdf_excerpt(pdf_text):
    """Let the user pick an approximate line range of the PDF text.

    The slider is bounded by the document length, so every part of the text
    can be reached, and the default range is never empty.  The default end
    stays at roughly a tenth of the document, as before.
    """
    total_lines = max(1, -(-len(pdf_text) // _CHARS_PER_LINE))  # ceiling
    default_end = min(total_lines, max(1, len(pdf_text) // (7 * 100)))
    first_line, last_line = st.slider(
        'Select a approximate number of lines to see and summarize',
        min_value=0,
        max_value=total_lines,
        value=[0, default_end],
    )
    return pdf_text[first_line * _CHARS_PER_LINE:last_line * _CHARS_PER_LINE]


def _show_generated_text(prompt):
    """Send the prompt to the endpoint at API_URL2 and display the result."""
    try:
        response = requests.post(
            API_URL2, headers=headers2, json={"inputs": prompt}, timeout=60
        )
        out = response.json()
    except (requests.RequestException, ValueError) as err:
        # ValueError covers a response body that is not valid JSON.
        st.error(f"Text generation request failed: {err}")
        return

    first = out[0] if isinstance(out, list) and out else None
    generated = first.get("generated_text") if isinstance(first, dict) else None
    if generated:
        st.success(generated)
    else:
        reason = out.get("error") if isinstance(out, dict) else None
        st.warning(reason or "The text generation service returned no text.")


def main():
    """Render the summarization page.

    Collects text from one of four sources, checked in this order: an
    uploaded PDF, an uploaded image, a camera photo, or the typed message.
    Images go through Tesseract OCR after the user picks Bangla or English.
    The text can then be summarized (``bansum`` for Bangla OCR, ``engsum``
    otherwise) or used as a prompt for English text generation.
    """
    # NOTE(review): this literal is deliberately kept as a plain statement,
    # as in the original, where it was not the function's first statement.
    # With Streamlit "magic" enabled such a bare literal is presumably
    # written to the page -- confirm before removing it.
    """ NLP Based Application with Streamlit """
    st.markdown("""

#### Description

##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:

PDF Document/Image's Text Summarization.

""")

    def change_photo_state():
        # Widget callback: records that a file or photo has been supplied.
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your pdf/images/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    uploaded_photo = st.sidebar.file_uploader(
        "Upload your Images/PDF",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    camera_photo = st.camera_input(
        "Take a photo, Containing English texts",
        on_change=change_photo_state,
    )

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    # Nothing to work on until a file/photo was supplied or text was typed.
    if st.session_state["photo"] != "done" and not message:
        return

    text = " "
    is_bangla = False
    if uploaded_photo is not None and uploaded_photo.type == 'application/pdf':
        text = _select_pdf_excerpt(read_pdf(uploaded_photo))
        st.success(text)
    elif uploaded_photo is not None:
        text, is_bangla = _ocr_with_language_buttons(
            uploaded_photo, "Press the content type:"
        )
    elif camera_photo is not None:
        text, is_bangla = _ocr_with_language_buttons(
            camera_photo, "Please select the content type:"
        )
        st.success(text)
    else:
        text = message

    has_text = bool(text and text.strip())

    if st.checkbox("Mark for Text Summarization"):
        if not has_text:
            # Avoid sending a blank request to the remote service.
            st.warning("There is no text to summarize yet.")
        elif is_bangla:
            bansum(text)
        else:
            engsum(text)

    if st.checkbox("English Text Generation"):
        if has_text:
            _show_generated_text(text)
        else:
            st.warning("There is no text to generate from yet.")

    # The callback runs before the rerun that the click triggers, so the
    # stored OCR result is already gone when the page is drawn again.
    st.button("Refresh", on_click=_forget_ocr_result)
|
# Script entry point: build the page only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|