Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

File size: 6,825 Bytes

9c37e72
dba2773
 
9c37e72
1a16a58
9c37e72
0c5b55b
9c37e72
0c5b55b
9c37e72
 
 
1a16a58
9c37e72
 
 
0c5b55b
9c37e72
 
6e58c44
bd18577
 
 
 
 
 
 
c75cc74
9c37e72
36603f5
09d4214
 
 
 
9531d63
9c37e72
 
 
 
 
 
 
 
419e04c
9c37e72
09d4214
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54ee49c
1a16a58
 
 
54ee49c
f6a6e42
9c37e72
 
 
 
 
 
9531d63
 
 
 
 
9c37e72
f6a6e42
9c37e72
 
 
 
 
 
 
 
333985e
9c37e72
 
1a16a58
 
 
 
 
bb357c0
9c37e72
 
6f9931f
3dedd9a
9222805
09d4214
9c37e72
 
 
 
8cc1e8b
 
 
 
 
 
 
 
 
c08e6a6
b9c9fb8
d048358
 
 
 
 
1babbd0
9c37e72
 
d048358
9332667
b046f0b
6e161b2
9332667
b3eadf5
 
 
9332667
96ca6f6
57054ab
 
7dea7b8
7fbea79
7dea7b8
 
 
 
dba2773
c95ac40
9c37e72
c95ac40
9c37e72
 
 
d5f01ec
c95ac40
1272866
fd37fd8
c95ac40
9c37e72
 
 
 
 
 
 
 
 
7008c69
9c37e72

"""
#App: NLP App with Streamlit
Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
Description
This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:

+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy

+ Named Entity Recognition(NER)/Trigger word detection using SpaCy

+ Sentiment Analysis using TextBlob

+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.

This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
Purpose
To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim
"""
# Core Pkgs
import os
#os.system('sudo apt-get install tesseract-ocr-eng')
#os.system('sudo apt-get install tesseract-ocr-ben')

#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
#os.system('gunzip ben.traineddata.gz ')
#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
#os.system('pip install -q pytesseract')
import streamlit as st
import torch
from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
import docx2txt
from PIL import Image 
from PyPDF2 import PdfFileReader
import pdfplumber

# NLP Pkgs
from textblob import TextBlob 
import spacy
from gensim.summarization import summarize
import requests
import cv2
import numpy as np
import pytesseract
#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
from PIL import Image
def read_pdf(file):
    """Extract and concatenate the text of every page in a PDF.

    Args:
        file: PDF source, passed straight to ``PdfFileReader``.

    Returns:
        The text of all pages joined in page order with no separator
        (an empty string when the document has no pages).
    """
    reader = PdfFileReader(file)
    page_texts = [
        reader.getPage(page_no).extractText()
        for page_no in range(reader.numPages)
    ]
    return "".join(page_texts)
	
#def read_pdf_with_pdfplumber(file):
#	with pdfplumber.open(file) as pdf:
#	    page = pdf.pages[0]
#	    return page.extract_text()


# Page header: a manual refresh control followed by the app title.
refresh_requested = st.button("REFRESH")
if refresh_requested:
    # Ask Streamlit to rerun the script.
    st.experimental_rerun()

st.title("Streamlit NLP APP")
@st.experimental_singleton
def text_analyzer(my_text):
    """Pair every token of ``my_text`` with its lemma.

    Args:
        my_text: Text to run through the spaCy ``en_core_web_sm`` pipeline.

    Returns:
        A list with one formatted string per token, each holding the token
        text and its lemma.
    """
    pipeline = spacy.load('en_core_web_sm')
    row_template = '"Token":{},\n"Lemma":{}'
    rows = []
    for tok in pipeline(my_text):
        rows.append(row_template.format(tok.text, tok.lemma_))
    return rows
@st.experimental_singleton
def load_models():
    """Load the tokenizer and language model used for text generation.

    Returns:
        A ``(tokenizer, model)`` tuple, both loaded from the ``gpt2-large``
        checkpoint.
    """
    checkpoint = 'gpt2-large'
    return (
        AutoTokenizer.from_pretrained(checkpoint),
        GPT2LMHeadModel.from_pretrained(checkpoint),
    )
# Named-entity extraction
@st.experimental_singleton
def entity_analyzer(my_text):
    """Report the tokens and named entities spaCy finds in ``my_text``.

    Args:
        my_text: Text to run through the spaCy ``en_core_web_sm`` pipeline.

    Returns:
        A one-element list whose single string contains the list of token
        texts followed by the list of ``(entity text, entity label)`` pairs.
    """
    parsed = spacy.load('en_core_web_sm')(my_text)
    token_texts = []
    for tok in parsed:
        token_texts.append(tok.text)
    named_entities = [(ent.text, ent.label_) for ent in parsed.ents]
    report = '"Token":{},\n"Entities":{}'.format(token_texts, named_entities)
    return [report]
# Markdown shown at the top of the page. Kept flush-left so that no line is
# indented far enough to be treated as a code block by the markdown renderer.
_APP_DESCRIPTION = """
#### Description
This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:

+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
+ Sentiment Analysis using TextBlob
+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
"""


@st.experimental_singleton
def _load_t5_models():
    """Load the ``t5-base`` tokenizer and model once and reuse them across reruns."""
    tokenizer = AutoTokenizer.from_pretrained('t5-base')
    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
    return tokenizer, model


def _ocr_image(image_file):
    """Run Tesseract OCR on an uploaded or captured image and return its text.

    The image is handed to pytesseract in memory rather than through a
    fixed-name file on disk, so concurrent sessions cannot overwrite each
    other's picture. A checkbox switches recognition to the Bangla ("ben")
    language data; otherwise pytesseract's default language is used.
    """
    picture = Image.open(image_file)
    if st.checkbox("Mark to see Bangla Image's Text"):
        return pytesseract.image_to_string(picture, lang="ben")
    return pytesseract.image_to_string(picture)


def _collect_text(message, camera_photo, uploaded_photo):
    """Pick the text to analyse: uploaded file first, then camera, then typed text.

    Text extracted from a file or photo is echoed on the page; typed text is
    returned as-is.
    """
    if uploaded_photo is not None:
        if uploaded_photo.type == "application/pdf":
            text = read_pdf(uploaded_photo)
        else:
            # The uploader only accepts jpg/png/jpeg/pdf, so anything that is
            # not a PDF is an image.
            text = _ocr_image(uploaded_photo)
        st.success(text)
        return text
    if camera_photo is not None:
        text = _ocr_image(camera_photo)
        st.success(text)
        return text
    return message


def _render_features(text):
    """Render the optional NLP tools (one checkbox each) for ``text``."""
    if not text or not text.strip():
        # OCR can come back empty, and the photo flag stays "done" after the
        # image is removed; the models below cannot work on empty input.
        st.warning("No text found. Type some text or provide an image/PDF that contains text.")
        return

    if st.checkbox("Show Named Entities English/Bangla"):
        st.json(entity_analyzer(text))

    if st.checkbox("Show Sentiment Analysis for English"):
        st.success(TextBlob(text).sentiment)

    if st.checkbox("Spell Corrections for English"):
        st.success(TextBlob(text).correct())

    if st.checkbox("Text Generation"):
        if st.button("Generate"):
            tokenizer, model = load_models()
            input_ids = tokenizer(text, return_tensors='pt').input_ids
            # NOTE(review): the message mentions contrastive search, but no
            # contrastive-search arguments are passed to generate() — confirm
            # which decoding strategy is intended.
            st.text("Using Hugging Face Transformer, Contrastive Search ..")
            output = model.generate(input_ids, max_length=128)
            st.success(tokenizer.decode(output[0], skip_special_tokens=True))

    if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
        try:
            summary_result = summarize(text)
        except ValueError as err:
            # gensim rejects input it cannot split into at least two sentences.
            st.warning("Could not summarize the text: {}".format(err))
        else:
            st.success(summary_result)

    if st.checkbox("Mark to better English Text Summarization!"):
        tokenizer, model = _load_t5_models()
        inputs = tokenizer.encode(
            "summarize: " + text,
            return_tensors='pt',
            max_length=512,
            truncation=True,
        )
        summary_ids = model.generate(
            inputs,
            max_length=150,
            min_length=80,
            length_penalty=5.,
            num_beams=2,
        )
        # Drop special tokens so the summary reads as plain text, matching
        # the text-generation output.
        st.success(tokenizer.decode(summary_ids[0], skip_special_tokens=True))


def main():
    """Build the Streamlit page for the NLP app.

    Shows the description, collects input (typed text, a camera photo, or an
    uploaded image/PDF), extracts text from it and offers named-entity
    recognition, sentiment analysis, spell correction, text generation and
    two summarizers. The tools appear once text has been typed or a
    photo/file widget has changed during the session.
    """
    st.markdown(_APP_DESCRIPTION)

    def change_photo_state():
        # Remember that the user interacted with a photo/file widget.
        st.session_state["photo"] = "done"

    if "photo" not in st.session_state:
        st.session_state["photo"] = "not done"

    st.subheader("Please, feed your image/text, features/services will appear automatically!")
    message = st.text_input("Type your text here!")
    camera_photo = st.camera_input(
        "Take a photo, Containing English or Bangla texts",
        on_change=change_photo_state,
    )
    uploaded_photo = st.file_uploader(
        "Upload Image, Containing English or Bangla texts",
        type=['jpg', 'png', 'jpeg', 'pdf'],
        on_change=change_photo_state,
    )

    if st.session_state["photo"] == "done" or message:
        text = _collect_text(message, camera_photo, uploaded_photo)
        _render_features(text)

    st.sidebar.subheader("About App")
    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")

# Script entry point: build the page when this file is executed directly.
if __name__ == '__main__':
	main()