|
""" |
|
#App: NLP App with Streamlit |
|
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery) |
|
Description |
|
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:
|
|
|
+ Tokenization (POS tagging) & Lemmatization (reduction to root form) using spaCy
|
|
|
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy |
|
|
|
+ Sentiment Analysis using TextBlob |
|
|
|
+ Document/Text Summarization using Gensim/T5 — Bangla extractive and English abstractive.
|
|
|
This is built with Streamlit Framework, an awesome framework for building ML and NLP tools. |
|
Purpose |
|
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim |
|
""" |
|
|
|
import os |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import streamlit as st |
|
import torch |
|
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel |
|
import docx2txt |
|
from PIL import Image |
|
from PyPDF2 import PdfFileReader |
|
from pdf2image import convert_from_bytes |
|
import pdfplumber |
|
|
|
import pdf2image |
|
|
|
|
|
|
|
from textblob import TextBlob |
|
import spacy |
|
from gensim.summarization import summarize |
|
import requests |
|
import cv2 |
|
import numpy as np |
|
import pytesseract |
|
|
|
|
|
from PIL import Image |
|
def mark_region(im):
    """Detect large text regions in a scanned page image.

    Parameters
    ----------
    im : numpy.ndarray
        BGR page image as returned by ``cv2.imread``. Modified in place:
        detected regions are outlined on it.

    Returns
    -------
    tuple
        ``(image, line_items_coordinates)`` where ``image`` is the (possibly
        annotated) input image and ``line_items_coordinates`` is a list of
        ``[(x1, y1), (x2, y2)]`` rectangles, one per detected region.
    """
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30)

    # Dilate so characters/words merge into solid blobs before contouring.
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # findContours returns 2 values on OpenCV 2/4 and 3 values on OpenCV 3.
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # FIX: `image` was previously assigned only inside the conditionals, so
    # the final `return image` raised UnboundLocalError whenever no contour
    # matched. cv2.rectangle draws on `im` in place, so initializing to `im`
    # preserves the original behaviour in every other case.
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, w, h = cv2.boundingRect(c)

        # NOTE(review): the pixel thresholds below assume a roughly
        # 2200px-wide high-resolution page scan -- confirm for other inputs.
        if y >= 600 and x <= 1000:
            if area > 10000:
                image = cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
                line_items_coordinates.append([(x, y), (2200, y + h)])

        if y >= 2400 and x <= 2000:
            image = cv2.rectangle(im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3)
            line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates
|
|
|
|
|
|
|
|
|
def read_pdf(file):
    """OCR every page of a (possibly scanned) PDF and return its text.

    Parameters
    ----------
    file : str
        Filesystem path of the PDF (``pdf2image.convert_from_path``
        requires a path, not an open handle).

    Returns
    -------
    str
        OCR text of all pages, separated by single spaces.

    Notes
    -----
    Fixes vs. the original:
    * ``convert_from_path`` already yields PIL images, so each page is
      saved directly instead of the invalid ``Image.open(page)`` call.
    * The language checkbox is created once, before the loop; creating
      the same-labelled widget per page raised ``DuplicateWidgetID``.
    * The ``st.experimental_singleton`` decorator is dropped: Streamlit
      widgets may not be created inside cached functions.
    """
    use_bangla = st.checkbox("Mark to see Bangla Image's Text")
    images = pdf2image.convert_from_path(file)
    all_page_text = ""
    for page in images:
        # Round-trip through disk so OCR runs on an OpenCV (BGR) array,
        # matching the image branches elsewhere in the app.
        page.save("img.png")
        image_name = cv2.imread("img.png")
        if use_bangla:
            text = pytesseract.image_to_string(image_name, lang="ben")
        else:
            text = pytesseract.image_to_string(image_name)
        all_page_text += text + " "
    return all_page_text
|
def read_pdf_with_pdfplumber(file):
    """Extract the embedded (selectable) text layer of a PDF.

    Parameters
    ----------
    file : str or file-like object
        Path to, or open binary handle of, the PDF to read.

    Returns
    -------
    str
        Concatenated text of all pages, separated by single spaces.

    Notes
    -----
    The original body referenced undefined names (``image_name``,
    ``all_page_text``) and raised ``NameError`` on every call. It now
    uses pdfplumber's text extraction, which suits PDFs that carry a
    text layer (unlike :func:`read_pdf`, which OCRs scanned pages).
    """
    all_page_text = ""
    with pdfplumber.open(file) as pdf:
        for page in pdf.pages:
            # extract_text() returns None for pages without a text layer.
            all_page_text += (page.extract_text() or "") + " "
    return all_page_text
|
# Top-level page title, rendered on every script run.
st.title("Streamlit NLP APP")
|
@st.experimental_singleton
def text_analyzer(my_text):
    """Tokenize *my_text* with spaCy and report each token's lemma.

    Returns a list with one formatted '"Token":...,"Lemma":...' string
    per token in the input. The result is cached per input text.
    """
    pipeline = spacy.load('en_core_web_sm')
    document = pipeline(my_text)
    results = []
    for token in document:
        results.append('"Token":{},\n"Lemma":{}'.format(token.text, token.lemma_))
    return results
|
@st.experimental_singleton
def load_models():
    """Load the GPT-2 large tokenizer and language model (cached).

    Returns
    -------
    tuple
        ``(tokenizer, model)`` pair used by the text-generation feature.
    """
    checkpoint = 'gpt2-large'
    gpt2_tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    gpt2_model = GPT2LMHeadModel.from_pretrained(checkpoint)
    return gpt2_tokenizer, gpt2_model
|
|
|
@st.experimental_singleton
def entity_analyzer(my_text):
    """Run spaCy named-entity recognition over *my_text*.

    Returns a single-element list holding one formatted string with all
    token texts plus all ``(entity text, label)`` pairs found.
    """
    pipeline = spacy.load('en_core_web_sm')
    document = pipeline(my_text)
    token_texts = [tok.text for tok in document]
    entity_pairs = [(ent.text, ent.label_) for ent in document.ents]
    return ['"Token":{},\n"Entities":{}'.format(token_texts, entity_pairs)]
|
def main():
    """Streamlit entry point: render the UI and run the selected NLP task.

    Accepts typed text, a camera photo, or an uploaded image/PDF; extracts
    text (OCR for images and scanned PDFs), then offers NER, sentiment,
    spell correction, text generation, and summarization on the result.

    Fixes vs. the original:
    * ``uploaded_photo.type`` was dereferenced when ``uploaded_photo`` is
      ``None`` (text/camera-only path) -- now guarded.
    * The PDF branch passed an open write-mode file handle to ``read_pdf``;
      it now closes the file and passes the path.
    * Removed ``plt.figure``/``plt.imshow`` calls: ``plt`` was never
      imported, so that branch raised ``NameError`` (and bare pyplot calls
      render nothing in Streamlit anyway).
    * ``lc[1]`` was indexed unconditionally -- now guarded against short or
      empty region lists.
    * ``text`` can no longer be unbound when the feature checkboxes run.
    """
    st.markdown("""
#### Description
##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
    """)

    def change_photo_state():
        # Flag in session state that an image source has been provided.
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
    uploaded_photo = st.file_uploader("Upload Image/PDF, Containing English or Bangla texts", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
        # Guard every `.type` access: uploaded_photo is None when only a
        # camera photo or a typed message was provided.
        if uploaded_photo is not None and uploaded_photo.type == 'application/pdf':
            data = uploaded_photo.read()
            # Persist the upload so pdf2image can read it from a path, and
            # close the handle before handing the path to read_pdf.
            with open(uploaded_photo.name, 'wb') as pdf_file:
                pdf_file.write(data)
            text = read_pdf(uploaded_photo.name)
            st.success(text)
        elif uploaded_photo is not None:
            # Any non-PDF upload is treated as an image (the original
            # `type != "application/image"` test was always true here,
            # since image uploads have MIME types like "image/png").
            pil_img = Image.open(uploaded_photo)
            pil_img.save("img.png")
            img = cv2.imread("img.png")

            image, lc = mark_region(img)
            # Crop to the second detected region when available; fall back
            # to the first, or to the whole page when none were found
            # (the original indexed lc[1] unconditionally).
            if lc:
                c = lc[1] if len(lc) > 1 else lc[0]
                img = image[c[0][1]:c[1][1], c[0][0]:c[1][0]]

            _, thresh1 = cv2.threshold(img, 120, 255, cv2.THRESH_BINARY)

            text = str(pytesseract.image_to_string(thresh1, config='--psm 6', lang="ben")) if st.checkbox("Bangla") else str(pytesseract.image_to_string(thresh1, config='--psm 6'))
            st.success(text)
        elif camera_photo:
            pil_img = Image.open(camera_photo)
            pil_img.save("img.png")
            img = cv2.imread("img.png")
            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see Bangla Image's Text") else pytesseract.image_to_string(img)
            st.success(text)
        else:
            # No image source at all: analyze the typed message.
            text = message

        if st.checkbox("Show Named Entities English/Bangla"):
            entity_result = entity_analyzer(text)
            st.json(entity_result)
        if st.checkbox("Show Sentiment Analysis for English"):
            blob = TextBlob(text)
            st.success(blob.sentiment)
        if st.checkbox("Spell Corrections for English"):
            st.success(TextBlob(text).correct())
        if st.checkbox("Text Generation"):
            if st.button("Generate"):
                tokenizer, model = load_models()
                input_ids = tokenizer(text, return_tensors='pt').input_ids
                st.text("Using Hugging Face Transformer, Contrastive Search ..")
                output = model.generate(input_ids, max_length=128)
                st.success(tokenizer.decode(output[0], skip_special_tokens=True))
        if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
            # gensim's extractive summarizer; raises ValueError on very
            # short input -- presumably acceptable here (TODO confirm).
            summary_result = summarize(text)
            st.success(summary_result)
        if st.checkbox("Mark to better English Text Summarization!"):
            # T5 abstractive summarization (English only).
            tokenizer = AutoTokenizer.from_pretrained('t5-base')
            model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
            inputs = tokenizer.encode("summarize: " + text,
                                      return_tensors='pt',
                                      max_length=512,
                                      truncation=True)
            summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
            summary = tokenizer.decode(summary_ids[0])
            st.success(summary)

    if st.button("REFRESH"):
        st.experimental_rerun()

    st.sidebar.subheader("About App")
    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")
|
# Script entry point: launch the Streamlit app.
if __name__ == '__main__':
    main()
|
|