Spaces:

Soumen
/

Text-Summarization-and-NLP-tasks

Sleeping

App Files Files Community

Text-Summarization-and-NLP-tasks / app.py

Soumen

Update app.py

18124fd over 1 year ago

raw

history blame

No virus

5 kB

	"""
	## App: NLP App with Streamlit
	Credits: Streamlit Team,Marc Skov Madsen(For Awesome-streamlit gallery)
	Description
	This is a Natural Language Processing (NLP) based app useful for basic NLP tasks such as the following:

	+ Tokenization & Lemmatization using Spacy

	+ Named Entity Recognition(NER) using SpaCy

	+ Sentiment Analysis using TextBlob

	+ Document/Text Summarization using Gensim/T5

	This is built with Streamlit Framework, an awesome framework for building ML and NLP tools.
	Purpose
	To perform basic and useful NLP tasks with Streamlit, spaCy, TextBlob and Gensim
	"""
	# Core Pkgs
	import os

	# System-level OCR setup, executed when the script is loaded: install the
	# Tesseract English and Bengali language packs, fetch the Bengali
	# trained-data file, and install the pytesseract Python wrapper.
	#
	# `-y` answers apt-get's confirmation prompt; without it the install stops
	# at the prompt because no terminal is attached to answer it.
	os.system('sudo apt-get install -y tesseract-ocr-eng')
	os.system('sudo apt-get install -y tesseract-ocr-ben')
	# The trained-data file is downloaded uncompressed, so there is no gunzip
	# step (the former `gunzip ben.traineddata.gz` referred to a file that was
	# never downloaded).
	os.system('wget https://github.com/tesseract-ocr/tessdata/raw/main/ben.traineddata')
	os.system('sudo mv -v ben.traineddata /usr/local/share/tessdata/')
	os.system('pip install -q pytesseract')
	import streamlit as st
	import os
	import torch
	from transformers import AutoTokenizer, AutoModelWithLMHead

	# NLP Pkgs
	from textblob import TextBlob
	import spacy
	from gensim.summarization import summarize
	import requests
	import cv2
	import numpy as np
	import pytesseract
	#pytesseract.pytesseract.tesseract_cmd = r"./Tesseract-OCR/tesseract.exe"
	from PIL import Image
	@st.cache
	def text_analyzer(my_text):
	    """Tokenize *my_text* with spaCy and describe every token.

	    Returns a list with one string per token; each string holds the token
	    text and its lemma in a '"Token":..., "Lemma":...' layout.
	    """
	    pipeline = spacy.load('en_core_web_sm')
	    template = '"Token":{},\n"Lemma":{}'
	    described = []
	    for tok in pipeline(my_text):
	        described.append(template.format(tok.text, tok.lemma_))
	    return described

	# Function For Extracting Entities
	@st.cache
	def entity_analyzer(my_text):
	    """Extract the tokens and named entities of *my_text* with spaCy.

	    Returns a single-element list whose only item is a string showing the
	    list of token texts and the list of (entity text, entity label) pairs.
	    """
	    parsed = spacy.load('en_core_web_sm')(my_text)

	    words = []
	    for tok in parsed:
	        words.append(tok.text)

	    named = []
	    for ent in parsed.ents:
	        named.append((ent.text, ent.label_))

	    report = '"Token":{},\n"Entities":{}'.format(words, named)
	    return [report]


	def _ocr_image(image_file, lang=None):
	    """Return the text Tesseract recognises in an uploaded or captured image.

	    The image is decoded with PIL and passed to pytesseract in memory, so no
	    shared ``img.png`` is written to the working directory (which concurrent
	    sessions would overwrite). ``lang`` is the Tesseract language code;
	    ``None`` keeps pytesseract's default language.
	    """
	    # convert("RGB") drops alpha/palette data and yields a plain image that
	    # pytesseract can serialise regardless of the source file's format.
	    picture = Image.open(image_file).convert("RGB")
	    return pytesseract.image_to_string(picture, lang=lang)


	def main():
	    """NLP Based App with Streamlit.

	    Renders, in order: a title and description; one of the named-entity,
	    sentiment or spell-correction tools (each checkbox is only offered while
	    the previous ones are unticked); and a summary section that takes its
	    text from an uploaded image, a camera photo or a text box and summarizes
	    it with Gensim or T5. Sidebar credits are shown last.
	    """

	    # Title
	    st.title("Streamlit NLP APP")
	    st.markdown("""
	#### Description
	+ This is a Natural Language Processing(NLP) Based App useful for basic NLP task
	NER,Sentiment, Spell Corrections and Summarization
	""")

	    # Entity Extraction
	    if st.checkbox("Show Named Entities"):
	        st.subheader("Analyze Your Text")

	        message = st.text_area("Enter your Text", "Typing Here ..")
	        if st.button("Extract"):
	            entity_result = entity_analyzer(message)
	            st.json(entity_result)

	    # Sentiment Analysis
	    elif st.checkbox("Show Sentiment Analysis"):
	        st.subheader("Analyse Your Text")
	        message = st.text_area("Enter Text plz", "Type Here .")
	        if st.button("Analyze"):
	            blob = TextBlob(message)
	            result_sentiment = blob.sentiment
	            st.success(result_sentiment)

	    # Text Corrections
	    elif st.checkbox("Spell Corrections"):
	        st.subheader("Correct Your Text")
	        message = st.text_area("Enter the Text", "Type please ..")
	        if st.button("Spell Corrections"):
	            st.text("Using TextBlob ..")
	            st.success(TextBlob(message).correct())

	    def change_photo_state():
	        # Widget callback: record that an image has been supplied.
	        st.session_state["photo"] = "done"

	    st.subheader("Summary section, feed your image!")
	    camera_photo = st.camera_input("Take a photo", on_change=change_photo_state)
	    uploaded_photo = st.file_uploader(
	        "Upload Image",
	        type=['jpg', 'png', 'jpeg'],
	        on_change=change_photo_state,
	    )
	    message = st.text_input("Or, drop your text here!")
	    if "photo" not in st.session_state:
	        st.session_state["photo"] = "not done"

	    if st.session_state["photo"] == "done" or message:
	        # Typed text is the fallback; OCR output replaces it when an image
	        # is present (the camera photo wins if both images are supplied).
	        text = message
	        if uploaded_photo is not None:
	            # Uploaded images are read as Bengali.
	            text = _ocr_image(uploaded_photo, lang="ben")
	            st.success(text)
	        if camera_photo is not None:
	            # Camera photos use pytesseract's default language.
	            text = _ocr_image(camera_photo)
	            st.success(text)

	        # Summarization
	        if st.checkbox("Show Text Summarization Genism"):
	            st.subheader("Summarize Your Text")
	            st.text("Using Gensim Summarizer ..")
	            try:
	                summary_result = summarize(text)
	            except ValueError as err:
	                # gensim rejects input it cannot summarize (e.g. a single
	                # sentence); report that instead of crashing the page.
	                st.warning(str(err))
	            else:
	                st.success(summary_result)
	        elif st.checkbox("Show Text Summarization T5"):
	            st.subheader("Summarize Your Text")
	            tokenizer = AutoTokenizer.from_pretrained('t5-base')
	            # NOTE(review): AutoModelWithLMHead is deprecated in recent
	            # transformers releases; AutoModelForSeq2SeqLM is the suggested
	            # replacement but needs the file-level import changed.
	            model = AutoModelWithLMHead.from_pretrained('t5-base', return_dict=True)
	            st.text("Using Google T5 Transformer ..")
	            inputs = tokenizer.encode(
	                "summarize: " + text,
	                return_tensors='pt',
	                max_length=512,
	                truncation=True,
	            )
	            summary_ids = model.generate(
	                inputs,
	                max_length=150,
	                min_length=80,
	                length_penalty=5.,
	                num_beams=2,
	            )
	            # Strip <pad> / </s> markers so only the summary text is shown.
	            summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)
	            st.success(summary)

	    st.sidebar.subheader("About App")
	    st.sidebar.subheader("By")
	    st.sidebar.text("Soumen Sarker")

	# Script entry point: build the app only when this file is run directly.
	if __name__ == '__main__':
	    main()