""" |
|
#App: NLP App with Streamlit |
|
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery) |
|
Description |
|
This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows; |
|
|
|
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy |
|
|
|
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy |
|
|
|
+ Sentiment Analysis using TextBlob |
|
|
|
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstructive. |
|
|
|
This is built with Streamlit Framework, an awesome framework for building ML and NLP tools. |
|
Purpose |
|
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim |
|
""" |
|
|
import os

import streamlit as st
import torch
# AutoModelForSeq2SeqLM replaces the deprecated AutoModelWithLMHead for T5.
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, GPT2LMHeadModel
import docx2txt
from PIL import Image
# PdfReader is the PyPDF2 >= 3.0 name for the legacy PdfFileReader.
from PyPDF2 import PdfReader
import pdfplumber

from textblob import TextBlob
import spacy
from gensim.summarization import summarize  # extractive TextRank; requires gensim < 4.0
import requests
import cv2
import numpy as np
import pytesseract
|
def read_pdf(file):
    """Concatenate the text extracted from every page of an uploaded PDF."""
    # Uses the PyPDF2 >= 3.0 API; the legacy equivalent was
    # PdfFileReader / numPages / getPage / extractText.
    pdfReader = PdfReader(file)
    all_page_text = ""
    for page in pdfReader.pages:
        all_page_text += page.extract_text() or ""
    return all_page_text


st.title("Streamlit NLP App")
|
@st.experimental_singleton
def load_spacy():
    """Load and cache the spaCy English pipeline once per process."""
    return spacy.load('en_core_web_sm')

def text_analyzer(my_text):
    """Return a token/lemma pair for every token in the text."""
    nlp = load_spacy()
    docx = nlp(my_text)
    allData = ['"Token":{},\n"Lemma":{}'.format(token.text, token.lemma_) for token in docx]
    return allData

@st.experimental_singleton
def load_models():
    """Load and cache the GPT-2 tokenizer and model used for text generation."""
    tokenizer = AutoTokenizer.from_pretrained('gpt2-large')
    model = GPT2LMHeadModel.from_pretrained('gpt2-large')
    return tokenizer, model

def entity_analyzer(my_text):
    """Return the token list and the (text, label) named entities of the text."""
    nlp = load_spacy()
    docx = nlp(my_text)
    tokens = [token.text for token in docx]
    entities = [(entity.text, entity.label_) for entity in docx.ents]
    allData = ['"Token":{},\n"Entities":{}'.format(tokens, entities)]
    return allData
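# Illustrative usage (not wired into the UI): entity_analyzer("Dhaka is the
# capital of Bangladesh.") returns a one-element list pairing the token list
# with detected entities such as ("Dhaka", "GPE"); main() below renders the
# result with st.json.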
|
def main():
    """NLP Based Application with Streamlit"""
    st.markdown("""
#### Description
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:
+ Tokenization (POS tagging) & Lemmatization (root form) using spaCy
+ Named Entity Recognition (NER) / trigger-word detection using spaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5, extractive for Bangla and abstractive for English
""")

    def change_photo_state():
        # Record in session state that a photo has been supplied.
        st.session_state["photo"] = "done"

    st.subheader("Please feed in your image or text; features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input("Take a photo containing English or Bangla text", on_change=change_photo_state)
    uploaded_photo = st.file_uploader("Upload an image or PDF containing English or Bangla text", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
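        # Input dispatch: an uploaded PDF is read directly, any other upload or
        # a camera photo goes through OCR, and otherwise the typed message is used.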
|
|
        if uploaded_photo is not None and uploaded_photo.type == 'application/pdf':
            text = read_pdf(uploaded_photo)
            st.success(text)
        elif uploaded_photo is not None:
            # Any non-PDF upload is treated as an image and run through Tesseract OCR.
            img = Image.open(uploaded_photo)
            img.save("img.png")
            img = cv2.imread("img.png")
            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see the Bangla text in the image") else pytesseract.image_to_string(img)
            st.success(text)
        elif camera_photo is not None:
            img = Image.open(camera_photo)
            img.save("img.png")
            img = cv2.imread("img.png")
            text = pytesseract.image_to_string(img, lang="ben") if st.checkbox("Mark to see the Bangla text in the image") else pytesseract.image_to_string(img)
            st.success(text)
        else:
            # No photo was supplied, so fall back to the typed message.
            text = message
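        # Every feature toggle below operates on whichever `text` was produced above.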
|
if st.checkbox("Show Named Entities English/Bangla"): |
|
entity_result = entity_analyzer(text) |
|
st.json(entity_result) |
|
if st.checkbox("Show Sentiment Analysis for English"): |
|
blob = TextBlob(text) |
|
result_sentiment = blob.sentiment |
|
st.success(result_sentiment) |
|
if st.checkbox("Spell Corrections for English"): |
|
st.success(TextBlob(text).correct()) |
|
if st.checkbox("Text Generation"): |
|
ok = st.button("Generate") |
|
if ok: |
|
tokenizer, model = load_models() |
|
input_ids = tokenizer(text, return_tensors='pt').input_ids |
|
st.text("Using Hugging Face Transformer, Contrastive Search ..") |
|
output = model.generate(input_ids, max_length=128) |
|
st.success(tokenizer.decode(output[0], skip_special_tokens=True)) |
|
if st.checkbox("Mark here, Text Summarization for English or Bangla!"): |
|
|
|
|
|
|
|
|
|
summary_result = summarize(text) |
|
st.success(summary_result) |
|
if st.checkbox("Mark to better English Text Summarization!"): |
|
|
|
tokenizer = AutoTokenizer.from_pretrained('t5-base') |
|
model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True) |
|
|
|
inputs = tokenizer.encode("summarize: " + text, |
|
return_tensors='pt', |
|
max_length=512, |
|
truncation=True) |
|
summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2) |
|
summary = tokenizer.decode(summary_ids[0]) |
|
st.success(summary) |
|
|
|
    if st.button("REFRESH"):
        st.experimental_rerun()

    st.sidebar.subheader("About App")
    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")

if __name__ == '__main__':
    main()
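# To launch the app locally (the filename is an assumption):
#   streamlit run app.py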
|