Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Running

App Files Files Community

Text-Summarization-and-NLP-tasks / app.py

Soumen

Update app.py

3f6c2be about 2 years ago

raw

history blame

6.9 kB

	"""
	#App: NLP App with Streamlit
	Credits: Streamlit Team, Marc Skov Madsen(For Awesome-streamlit gallery)
	Description
	This is a Natural Language Processing (NLP) based application that is useful for basic NLP tasks such as the following:

	+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy

	+ Named Entity Recognition(NER)/Trigger word detection using SpaCy

	+ Sentiment Analysis using TextBlob

	+ Document/Text Summarization using Gensim/T5, for both Bangla (extractive) and English (abstractive).

	This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
	Purpose
	To perform basic and useful NLP tasks with Streamlit, Spacy, Textblob, and Gensim
	"""
	# Core Pkgs
	import os
	#os.system('sudo apt-get install tesseract-ocr-eng')
	#os.system('sudo apt-get install tesseract-ocr-ben')

	#os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
	#os.system('gunzip ben.traineddata.gz ')
	#os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
	#os.system('pip install -q pytesseract')
	import streamlit as st
	import torch
	from transformers import AutoTokenizer, AutoModelWithLMHead, GPT2LMHeadModel
	import docx2txt
	from PIL import Image
	from PyPDF2 import PdfFileReader
	import pdfplumber

	# NLP Pkgs
	from textblob import TextBlob
	import spacy
	from gensim.summarization import summarize
	import requests
	import cv2
	import numpy as np
	import pytesseract
	#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
	from PIL import Image
	def read_pdf(file):
	    """Extract and concatenate the text of every page of a PDF.

	    Args:
	        file: Whatever ``PdfFileReader`` accepts as its source (``main``
	            passes the Streamlit upload object). ``None`` means "no
	            document was provided".

	    Returns:
	        str: The text of all pages joined in page order, or ``""`` when
	        ``file`` is ``None``.
	    """
	    # No document supplied: report empty text instead of failing in PyPDF2.
	    if file is None:
	        return ""
	    pdfReader = PdfFileReader(file)
	    # Join once rather than growing a string page by page.
	    return "".join(
	        pdfReader.getPage(i).extractText() for i in range(pdfReader.numPages)
	    )

	#def read_pdf_with_pdfplumber(file):
	# with pdfplumber.open(file) as pdf:
	# page = pdf.pages[0]
	# return page.extract_text()


	# Manual refresh control, rendered above the page title.
	refresh_requested = st.button("REFRESH")
	if refresh_requested:
	    # Re-execute the script from the top.
	    st.experimental_rerun()

	# Title
	st.title("Streamlit NLP APP")
	@st.experimental_singleton
	def text_analyzer(my_text):
	    """Tokenize ``my_text`` with spaCy and pair each token with its lemma.

	    Args:
	        my_text: The text to analyze.

	    Returns:
	        list[str]: One '"Token":<text>,\\n"Lemma":<lemma>' string per token,
	        in document order.
	    """
	    # NOTE(review): the singleton decorator presumably caches one result per
	    # distinct ``my_text`` -- confirm that is the intended caching scope.
	    pipeline = spacy.load('en_core_web_sm')
	    rows = []
	    for tok in pipeline(my_text):
	        rows.append('"Token":{},\n"Lemma":{}'.format(tok.text, tok.lemma_))
	    return rows
	@st.experimental_singleton
	def load_models():
	    """Load the pretrained 'gpt2-large' tokenizer and language model.

	    Returns:
	        tuple: ``(tokenizer, model)`` as returned by the two
	        ``from_pretrained`` calls.
	    """
	    checkpoint = 'gpt2-large'
	    return (
	        AutoTokenizer.from_pretrained(checkpoint),
	        GPT2LMHeadModel.from_pretrained(checkpoint),
	    )
	# Named-entity extraction
	@st.experimental_singleton
	def entity_analyzer(my_text):
	    """Run spaCy over ``my_text`` and report its tokens and named entities.

	    Args:
	        my_text: The text to analyze.

	    Returns:
	        list[str]: A single-element list holding one formatted string with
	        the list of token texts and the list of ``(entity text, label)``
	        pairs.
	    """
	    pipeline = spacy.load('en_core_web_sm')
	    parsed = pipeline(my_text)
	    token_texts = []
	    for tok in parsed:
	        token_texts.append(tok.text)
	    entity_pairs = [(ent.text, ent.label_) for ent in parsed.ents]
	    report = '"Token":{},\n"Entities":{}'.format(token_texts, entity_pairs)
	    return [report]
	@st.experimental_singleton
	def _load_t5_summarizer():
	    """Load the 't5-base' tokenizer and model once instead of on every rerun."""
	    tokenizer = AutoTokenizer.from_pretrained('t5-base')
	    model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
	    return tokenizer, model


	def _ocr_image(image_file):
	    """OCR an uploaded or captured image, display the text and return it."""
	    img = Image.open(image_file)
	    # The PIL image is passed straight to pytesseract. Writing it to a fixed
	    # "img.png" in the working directory first would let concurrent sessions
	    # overwrite each other's picture.
	    if st.checkbox("Mark to see Bangla Image's Text"):
	        text = pytesseract.image_to_string(img, lang="ben")
	    else:
	        text = pytesseract.image_to_string(img)
	    st.success(text)
	    return text


	def _pdf_text(uploaded_pdf, process_clicked):
	    """Return the extracted text of ``uploaded_pdf``, or None if not processed yet.

	    ``st.button`` is only true on the run right after the click, so the
	    extracted text is kept in ``st.session_state`` (tagged with the file's
	    name and size) to survive the reruns triggered by the feature checkboxes.
	    """
	    source = (uploaded_pdf.name, uploaded_pdf.size)
	    if process_clicked:
	        st.session_state["pdf_text"] = (source, read_pdf(uploaded_pdf))
	    if "pdf_text" in st.session_state:
	        cached_source, cached_text = st.session_state["pdf_text"]
	        # Ignore text that belongs to a previously uploaded document.
	        if cached_source == source:
	            return cached_text
	    return None


	def main():
	    """Render the NLP app: collect text, then offer the NLP features on it.

	    The working text comes from, in order of priority: a processed PDF
	    upload, OCR of an uploaded image, OCR of a camera photo, or the typed
	    message. The feature checkboxes (entities, sentiment, spell correction,
	    generation, summarization) then operate on that text.
	    """
	    st.markdown("""
	#### Description
	##This is a Natural Language Processing(NLP) base Application that is useful for basic NLP tasks such as follows:
	+ Tokenization(POS tagging) & Lemmatization(root mean) using Spacy
	+ Named Entity Recognition(NER)/Trigger word detection using SpaCy
	+ Sentiment Analysis using TextBlob
	+ Document/Text Summarization using Gensim/T5 both for Bangla Extractive and English Abstractive.
	""")

	    def change_photo_state():
	        # Widget callback: remember that an image/PDF has been supplied.
	        st.session_state["photo"] = "done"

	    st.subheader("Please, feed your image/text, features/services will appear automatically!")
	    message = st.text_input("Type your text here!")
	    camera_photo = st.camera_input("Take a photo, Containing English or Bangla texts", on_change=change_photo_state)
	    uploaded_photo = st.file_uploader("Upload Image/PDF, Containing English or Bangla texts", type=['jpg', 'png', 'jpeg', 'pdf'], on_change=change_photo_state)
	    if "photo" not in st.session_state:
	        st.session_state["photo"] = "not done"

	    if st.session_state["photo"] == "done" or message:
	        is_pdf = uploaded_photo is not None and uploaded_photo.type == "application/pdf"
	        process_clicked = st.button("Process_PDF")
	        if process_clicked and not is_pdf:
	            # Previously this path handed a missing or non-PDF upload to read_pdf.
	            st.warning("Please upload a PDF file before pressing Process_PDF.")

	        # Every path below binds ``text`` so the features never see it undefined.
	        pdf_text = _pdf_text(uploaded_photo, process_clicked) if is_pdf else None
	        if pdf_text is not None:
	            text = pdf_text
	            st.success(text)
	        elif uploaded_photo is not None and not is_pdf:
	            text = _ocr_image(uploaded_photo)
	        elif camera_photo is not None:
	            text = _ocr_image(camera_photo)
	        else:
	            text = message

	        if st.checkbox("Show Named Entities English/Bangla"):
	            entity_result = entity_analyzer(text)
	            st.json(entity_result)
	        if st.checkbox("Show Sentiment Analysis for English"):
	            blob = TextBlob(text)
	            result_sentiment = blob.sentiment
	            st.success(result_sentiment)
	        if st.checkbox("Spell Corrections for English"):
	            st.success(TextBlob(text).correct())
	        if st.checkbox("Text Generation"):
	            ok = st.button("Generate")
	            if ok:
	                tokenizer, model = load_models()
	                input_ids = tokenizer(text, return_tensors='pt').input_ids
	                # NOTE(review): the message mentions contrastive search, but the
	                # generate() call passes no contrastive-search arguments.
	                st.text("Using Hugging Face Transformer, Contrastive Search ..")
	                output = model.generate(input_ids, max_length=128)
	                st.success(tokenizer.decode(output[0], skip_special_tokens=True))
	        if st.checkbox("Mark here, Text Summarization for English or Bangla!"):
	            try:
	                summary_result = summarize(text)
	            except ValueError as err:
	                # gensim rejects input it cannot split into several sentences.
	                st.warning("Could not summarize this text: {}".format(err))
	            else:
	                st.success(summary_result)
	        if st.checkbox("Mark to better English Text Summarization!"):
	            tokenizer, model = _load_t5_summarizer()
	            inputs = tokenizer.encode("summarize: " + text,
	                                      return_tensors='pt',
	                                      max_length=512,
	                                      truncation=True)
	            summary_ids = model.generate(inputs, max_length=150, min_length=80, length_penalty=5., num_beams=2)
	            # Drop special tokens from the output, as the GPT-2 branch does.
	            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
	            st.success(summary)

	    st.sidebar.subheader("About App")
	    st.sidebar.markdown("By [Soumen Sarker](https://soumen-sarker-personal-website.streamlitapp.com/)")

	# Launch the app only when this file is run as a script, not on import.
	if __name__ == "__main__":
	    main()