"""
#App: NLP App with Streamlit

Credits: Streamlit Team, Marc Skov Madsen (for the Awesome-streamlit gallery)

Description
This is a Natural Language Processing (NLP) based app useful for basic NLP
concepts such as the following:
+ Tokenization & Lemmatization using spaCy
+ Named Entity Recognition (NER) using spaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 for both Bangla and English

This is built with the Streamlit framework, an awesome framework for building
ML and NLP tools.

Purpose
To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob and Gensim
"""
# Core Pkgs
import os
# Setup notes for the Tesseract OCR engine and its English/Bangla data:
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')
#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
# NLP Pkgs
from textblob import TextBlob
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image

# Title
st.title("Streamlit NLP APP")


@st.experimental_singleton
def _load_spacy_model():
    """Return the 'en_core_web_sm' spaCy pipeline, cached by Streamlit."""
    return spacy.load('en_core_web_sm')


@st.experimental_singleton
def text_analyzer(my_text: str) -> list:
    """Tokenize *my_text* and pair every token with its lemma.

    Not called by ``main()`` in this file; kept as a public helper.

    Args:
        my_text: English text to analyse.

    Returns:
        One string per token, formatted as ``"Token":<text>,\\n"Lemma":<lemma>``.
    """
    docx = _load_spacy_model()(my_text)
    return [
        f'"Token":{token.text},\n"Lemma":{token.lemma_}' for token in docx
    ]


@st.experimental_singleton
def load_models():
    """Load the GPT-2 (large) tokenizer and language model used for generation.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained('gpt2-large')
    model = GPT2LMHeadModel.from_pretrained('gpt2-large')
    return tokenizer, model


@st.experimental_singleton
def load_t5_models():
    """Load the T5 (base) tokenizer and model used for summarization.

    Cached in the same way as ``load_models`` so the weights are not read
    again on every script rerun.

    Returns:
        A ``(tokenizer, model)`` tuple.
    """
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return tokenizer, model


# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text: str) -> list:
    """Extract the tokens and named entities of *my_text*.

    Args:
        my_text: English text to analyse.

    Returns:
        A single-element list holding one string of the form
        ``"Token":[...],\\n"Entities":[(text, label), ...]``.
    """
    docx = _load_spacy_model()(my_text)
    tokens = [token.text for token in docx]
    entities = [(entity.text, entity.label_) for entity in docx.ents]
    return [f'"Token":{tokens},\n"Entities":{entities}']


def _extract_text(image_file, bangla_label: str) -> str:
    """Run Tesseract OCR on an uploaded or captured image.

    The image is decoded in memory rather than through a fixed file on disk,
    so concurrent sessions cannot overwrite each other's picture.

    Args:
        image_file: File-like object returned by a Streamlit image widget.
        bangla_label: Label of the checkbox that switches OCR to Bangla.

    Returns:
        The text recognised by Tesseract.
    """
    # pytesseract treats NumPy input as RGB, so hand it an RGB array.
    pixels = np.asarray(Image.open(image_file).convert("RGB"))
    if st.checkbox(bangla_label):
        return pytesseract.image_to_string(pixels, lang="ben")
    return pytesseract.image_to_string(pixels)


def main():
    """ NLP Based Application with Streamlit """
    st.markdown("""
    #### Description
    This is a Natural Language Processing(NLP) Based Application useful for basic NLP tasks Named Entity Recognition, Sentiment Analysis, Spell Corrections, Human Level Text Generation, and Summarization
    """)

    def change_photo_state():
        # Widget callback: remember that an image widget has been used.
        st.session_state["photo"] = "done"

    st.subheader("Summary section, feed your image!")
    camera_photo = st.camera_input(
        "Take a photo, Containing English or Bangla texts",
        on_change=change_photo_state,
    )
    uploaded_photo = st.file_uploader(
        "Upload Image, Containing English or Bangla texts",
        type=['jpg', 'png', 'jpeg'],
        on_change=change_photo_state,
    )
    message = st.text_input("Or, drop your text here, only English text!")

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
        # Choose the text to analyse. When both images are present the
        # camera capture is processed last and therefore wins.
        if uploaded_photo:
            text = _extract_text(
                uploaded_photo,
                "Mark here to see in Bangla for Bangla Images only",
            )
            st.success(text)
        if camera_photo:
            text = _extract_text(camera_photo, "Mark here to see Bangla")
            st.success(text)
        if uploaded_photo is None and camera_photo is None:
            text = message

        if st.checkbox("Show Named Entities"):
            entity_result = entity_analyzer(text)
            st.json(entity_result)

        if st.checkbox("Show Sentiment Analysis"):
            blob = TextBlob(text)
            result_sentiment = blob.sentiment
            st.success(result_sentiment)

        if st.checkbox("Spell Corrections"):
            st.success(TextBlob(text).correct())

        if st.checkbox("Text Generation"):
            ok = st.button("Generate")
            # Loaded as soon as the section is opened, before the button press.
            tokenizer, model = load_models()
            if ok:
                input_ids = tokenizer(text, return_tensors='pt').input_ids
                # NOTE(review): the status line mentions contrastive search,
                # but generate() is called with max_length only - confirm the
                # intended decoding strategy.
                st.text("Using Hugging Face Transformer, Contrastive Search ..")
                output = model.generate(input_ids, max_length=128)
                st.success(tokenizer.decode(output[0], skip_special_tokens=True))

        # Summarization
        if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
            try:
                summary_result = summarize(text)
            except ValueError as err:
                # Gensim refuses input it cannot split into several sentences;
                # report that instead of showing a traceback.
                st.warning(f"Could not summarize this text: {err}")
            else:
                st.success(summary_result)
        elif st.checkbox("Mark here, Better Text Summarization for English only!"):
            tokenizer, model = load_t5_models()
            inputs = tokenizer.encode(
                "summarize: " + text,
                return_tensors='pt',
                max_length=512,
                truncation=True,
            )
            summary_ids = model.generate(
                inputs,
                max_length=150,
                min_length=80,
                length_penalty=5.,
                num_beams=2,
            )
            # Strip <pad>/</s> markers, matching the text-generation branch.
            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
            st.success(summary)

    # No forced st.experimental_rerun() here: widget and session state are
    # unchanged between runs, so a rerun would discard the results rendered
    # above and immediately repeat the same work.

    st.sidebar.subheader("About App")
    st.sidebar.subheader("By")
    st.sidebar.text("Soumen Sarker")


if __name__ == '__main__':
    main()