"""
#App: NLP App with Streamlit

Credits: Streamlit Team, Marc Skov Madsen (for the Awesome-streamlit gallery)

Description
This is a Natural Language Processing (NLP) based application that is useful
for basic NLP tasks such as:
+ Tokenization (POS tagging) & lemmatization (root form) using spaCy
+ Named Entity Recognition (NER) / trigger word detection using spaCy
+ Sentiment analysis using TextBlob
+ Document/text summarization using Gensim/T5, extractive for Bangla and
  abstractive for English.

This is built with the Streamlit framework, an awesome framework for building
ML and NLP tools.

Purpose
To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob, and
Gensim.
"""
# Core Pkgs
import os

# System setup notes (run once on the host, not from this script):
#   sudo apt-get install tesseract-ocr-eng tesseract-ocr-ben
#   wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata
#   sudo mv -v ben.traineddata /usr/local/share/tessdata/
#   pip install -q pytesseract
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
import docx2txt
from PIL import Image
from PyPDF2 import PdfFileReader
from pdf2image import convert_from_bytes
import pdfplumber
import pdf2image

# NLP Pkgs
from textblob import TextBlob
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract

# On Windows, point pytesseract at a bundled binary if needed, e.g.:
# pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"

# Tesseract language code used when the user asks for Bangla OCR.
_BANGLA = "ben"

# Landing-page description rendered by main(). Kept flush-left so Markdown
# never treats the lines as an indented code block.
_DESCRIPTION = """
#### Description
##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
"""


def mark_region(im):
    """Find large text regions in a BGR image and outline them.

    The image is blurred, adaptively thresholded and dilated so adjacent text
    merges into blobs; the bounding box of each qualifying blob is drawn on
    ``im`` (``cv2.rectangle`` draws in place) and recorded.

    Args:
        im: BGR image as a NumPy array. It is modified in place.

    Returns:
        ``(image, line_items_coordinates)`` where ``image`` is the annotated
        input and ``line_items_coordinates`` is a list of
        ``[(x, y), (2200, y + h)]`` corner pairs. The list is empty, and
        ``image`` is the untouched input, when nothing qualifies.
    """
    gray = cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (9, 9), 0)
    thresh = cv2.adaptiveThreshold(
        blur, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY_INV, 11, 30
    )

    # Dilate to combine adjacent text contours
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=4)

    # Find contours, highlight text areas, and extract ROIs.
    # findContours returns 2 or 3 values depending on the OpenCV version.
    cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    cnts = cnts[0] if len(cnts) == 2 else cnts[1]

    # Bind the result up front: previously `image` was only assigned inside
    # the loop, so an image with no qualifying contour raised
    # UnboundLocalError at the return statement.
    image = im
    line_items_coordinates = []
    for c in cnts:
        area = cv2.contourArea(c)
        x, y, w, h = cv2.boundingRect(c)

        # NOTE(review): the source's indentation was lost; the second check
        # is reconstructed as nested under the first. Confirm the intent.
        if y >= 600 and x <= 1000:
            if area > 10000:
                image = cv2.rectangle(
                    im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
                )
                line_items_coordinates.append([(x, y), (2200, y + h)])

            if y >= 2400 and x <= 2000:
                image = cv2.rectangle(
                    im, (x, y), (2200, y + h), color=(255, 0, 255), thickness=3
                )
                line_items_coordinates.append([(x, y), (2200, y + h)])

    return image, line_items_coordinates


@st.experimental_singleton
def read_pdf(file, lang=None):
    """OCR every page of a PDF and return the concatenated text.

    Args:
        file: Either the raw PDF content (``bytes``/``bytearray``) or a
            filesystem path to a PDF.
        lang: Tesseract language code (e.g. ``"ben"``); ``None`` uses
            Tesseract's default language.

    Returns:
        The text of all pages, each page followed by a single space.
    """
    if isinstance(file, (bytes, bytearray)):
        pages = convert_from_bytes(bytes(file))
    else:
        pages = pdf2image.convert_from_path(file)

    # pdf2image already yields PIL images and pytesseract accepts them
    # directly, so no Image.open() call or temporary file is needed.
    return "".join(
        pytesseract.image_to_string(page, lang=lang) + " " for page in pages
    )


def read_pdf_with_pdfplumber(file):
    """Extract the embedded (non-OCR) text layer of a PDF with pdfplumber.

    Args:
        file: Path or binary file-like object accepted by ``pdfplumber.open``.

    Returns:
        The text of all pages, each page followed by a single space. Pages
        without extractable text contribute only the space.
    """
    with pdfplumber.open(file) as pdf:
        return "".join((page.extract_text() or "") + " " for page in pdf.pages)


st.title("Streamlit NLP APP")


@st.experimental_singleton
def _load_spacy_model():
    """Load the English spaCy pipeline once per server process."""
    return spacy.load('en_core_web_sm')


@st.experimental_singleton
def text_analyzer(my_text):
    """Return one ``"Token":…,\\n"Lemma":…`` string per token of ``my_text``."""
    doc = _load_spacy_model()(my_text)
    return ['"Token":{},\n"Lemma":{}'.format(token.text, token.lemma_) for token in doc]


@st.experimental_singleton
def load_models():
    """Load and cache the GPT-2 large tokenizer and model for text generation.

    Returns:
        ``(tokenizer, model)``.
    """
    tokenizer = AutoTokenizer.from_pretrained('gpt2-large')
    model = GPT2LMHeadModel.from_pretrained('gpt2-large')
    return tokenizer, model


@st.experimental_singleton
def _load_t5():
    """Load and cache the T5-base tokenizer and model used for summarization."""
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return tokenizer, model


# Function For Extracting Entities
@st.experimental_singleton
def entity_analyzer(my_text):
    """Return a one-element list describing the tokens and entities of ``my_text``."""
    doc = _load_spacy_model()(my_text)
    tokens = [token.text for token in doc]
    entities = [(entity.text, entity.label_) for entity in doc.ents]
    return ['"Token":{},\n"Entities":{}'.format(tokens, entities)]


def _to_bgr(image_file):
    """Decode an uploaded/captured image into a BGR array (OpenCV's layout)."""
    rgb = np.array(Image.open(image_file).convert("RGB"))
    return cv2.cvtColor(rgb, cv2.COLOR_RGB2BGR)


def _ocr_uploaded_image(uploaded_photo, lang):
    """OCR an uploaded image, preferring the second region mark_region finds."""
    original = _to_bgr(uploaded_photo)
    # mark_region draws on its input, so hand it a copy and keep `original`
    # clean for the whole-image fallback.
    marked, regions = mark_region(original.copy())

    target = original
    if len(regions) > 1:
        # Crop image[y0:y1, x0:x1] around the second detected region.
        (x0, y0), (x1, y1) = regions[1]
        crop = marked[y0:y1, x0:x1]
        if crop.size:
            target = crop

    # Convert the image to black and white for better OCR.
    _, binary = cv2.threshold(target, 120, 255, cv2.THRESH_BINARY)
    return str(pytesseract.image_to_string(binary, config='--psm 6', lang=lang))


def _extract_text(message, uploaded_photo, camera_photo):
    """Return the text to analyse from the upload, the camera, or the text box."""
    if uploaded_photo is not None:
        if uploaded_photo.type == 'application/pdf':
            bangla = st.checkbox("Mark to see Bangla Image's Text")
            text = read_pdf(uploaded_photo.getvalue(), lang=_BANGLA if bangla else None)
        else:
            bangla = st.checkbox("Bangla")
            text = _ocr_uploaded_image(uploaded_photo, _BANGLA if bangla else None)
        st.success(text)
        return text

    if camera_photo is not None:
        bangla = st.checkbox("Mark to see Bangla Image's Text")
        text = pytesseract.image_to_string(
            _to_bgr(camera_photo), lang=_BANGLA if bangla else None
        )
        st.success(text)
        return text

    return message


def _render_features(text):
    """Render the optional NLP tools (NER, sentiment, spelling, generation, summaries)."""
    if st.checkbox("Show Named Entities English/Bangla"):
        st.json(entity_analyzer(text))

    if st.checkbox("Show Sentiment Analysis for English"):
        st.success(str(TextBlob(text).sentiment))

    if st.checkbox("Spell Corrections for English"):
        st.success(str(TextBlob(text).correct()))

    if st.checkbox("Text Generation"):
        if st.button("Generate"):
            tokenizer, model = load_models()
            input_ids = tokenizer(text, return_tensors='pt').input_ids
            st.text("Using Hugging Face Transformer, Contrastive Search ..")
            output = model.generate(input_ids, max_length=128)
            st.success(tokenizer.decode(output[0], skip_special_tokens=True))

    if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
        try:
            summary_result = summarize(text)
        except ValueError as exc:
            # Gensim rejects input it cannot split into multiple sentences.
            st.warning("Could not summarize this text: {}".format(exc))
        else:
            st.success(summary_result)

    if st.checkbox("Mark to better English Text Summarization!"):
        tokenizer, model = _load_t5()
        inputs = tokenizer.encode(
            "summarize: " + text, return_tensors='pt', max_length=512, truncation=True
        )
        summary_ids = model.generate(
            inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2
        )
        # Drop <pad>/</s> markers, matching the text-generation output above.
        st.success(tokenizer.decode(summary_ids[0], skip_special_tokens=True))


def main():
    """ NLP Based Application with Streamlit """
    st.markdown(_DESCRIPTION)

    def change_photo_state():
        st.session_state["photo"] = "done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input(
        "Take a photo, Containing English or Bangla texts",
        on_change=change_photo_state,
    )
    uploaded_photo = st.file_uploader(
        "Upload Image/PDF, Containing English or Bangla texts",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )
    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    if st.session_state["photo"] == "done" or message:
        # The session flag stays "done" after an upload is removed, so the
        # source is chosen from the widgets' current values, never the flag.
        text = _extract_text(message, uploaded_photo, camera_photo)
        if text and text.strip():
            _render_features(text)

    if st.button("REFRESH"):
        st.experimental_rerun()

    st.sidebar.subheader("About App")
    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")


if __name__ == '__main__':
    main()