# app.py — Streamlit word-frequency counter (author: chinhon, commit 6023dde)
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import re
import streamlit as st
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer
# One-time download of the NLTK stop-word corpus (no-op if already cached).
nltk.download('stopwords')

# Global matplotlib settings: high-DPI figures with enlarged fonts so the
# chart stays legible when embedded in the Streamlit page.
mpl.rcParams["figure.dpi"] = 300
plt.rcParams.update({'font.size': 25})
mpl.rcParams['xtick.labelsize'] = 18
mpl.rcParams['ytick.labelsize'] = 18

# Page config must be the first Streamlit command.  Current Streamlit
# exposes set_page_config(); very old releases only had the beta_ variant.
# Catch AttributeError specifically (a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and mask unrelated errors).
try:
    st.set_page_config(layout="wide")
except AttributeError:
    st.beta_set_page_config(layout="wide")
# Stop-word list: NLTK's English stop words plus domain-specific noise —
# newsroom boilerplate, honorifics, and Singapore outlet names / URL
# fragments that should never appear in the frequency chart.
news_stopwords = [
    "would", "also", "may", "even", "like", "way", "year", "years",
    "one", "many", "us", "view", "singapore", "mr", "mrs", "ms",
    "madam", "madame", "prosperity", "every", "day", "https", "www",
    "channelnewsasia", "read", "today", "todayonline", "com", "news",
    "say", "said", "add", "added",
]
stop_words = stopwords.words("english") + news_stopwords
#text-cleaning function
def clean_text(text):
    """Normalise raw pasted text before vectorisation.

    Strips non-ASCII characters (e.g. Chinese text), lowercases, removes
    URLs, converts newlines/tabs and punctuation to spaces, drops
    standalone numbers, and collapses runs of spaces.

    Args:
        text: Raw input string.

    Returns:
        A cleaned, lowercase, single-spaced string.
    """
    # Drop every character outside ASCII (removes CJK text, emoji, etc.).
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = text.lower()
    text = re.sub(r"http\S+", "", text)   # strip URLs
    # Newlines and tabs -> spaces.  (The original also substituted "\n\n"
    # after "\n" — dead code, since no newline could remain by then.)
    text = re.sub(r"[\n\t]", " ", text)
    # Every non-word character -> space.  This also makes a follow-up
    # [^\w\s] removal pointless, so that dead statement was dropped too.
    text = re.sub(r"\W", " ", text)
    # Remove standalone numbers at the start, middle, or end of the text.
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # Collapse multiple spaces into one and trim the ends.
    text = re.sub(r" +", " ", text).strip()
    return text
# --- Page layout: title, input box, and n-gram slider ---
st.title("Word Frequency Counter")

# Multi-line text box where the user pastes the passage to analyse.
user_input = st.text_area(
    "Paste selected text here",
    height=250,
)

# Slider returns a (low, high) tuple; it is fed straight into
# CountVectorizer's ngram_range below (1 = single words, 3 = trigrams).
ngram_select = st.slider("N-gram Range, or length of consecutive words", min_value=1, max_value=3, value=(1, 3))

# Horizontal rule separating the input controls from the results area.
st.markdown(
    """
----------------------------
"""
)
# Run the analysis only once the user has pasted some text.
if user_input != "":
    with st.spinner("..."):
        input_text = clean_text(user_input)

        # Count term frequencies over the user-selected n-gram range,
        # filtering the combined NLTK + domain stop-word list.
        vect = CountVectorizer(
            stop_words=stop_words,
            min_df=1,
            ngram_range=(ngram_select[0], ngram_select[1]),
        )
        docs = vect.fit_transform([input_text])

        # get_feature_names() was deprecated in scikit-learn 1.0 and
        # removed in 1.2; prefer get_feature_names_out() and fall back
        # only on old versions that lack it.
        try:
            features = vect.get_feature_names_out()
        except AttributeError:
            features = vect.get_feature_names()

        convo_title = " "  # blank per-axes title; figure suptitle is used instead
        fig, ax = plt.subplots(figsize=(20, 12))
        fig.suptitle(
            "10 Most Frequently Used Word(s)",
            fontsize=25,
        )
        ax.legend().set_visible(False)

        # Yellowbrick draws the top-10 term-frequency bar chart onto the
        # current axes when fit() is called.
        visualiser = FreqDistVisualizer(
            features=features, n=10, color="steelblue", title=convo_title
        )
        visualiser.fit(docs)

        # Enlarge the title, axis labels, and tick labels for readability
        # at the 300-dpi figure size set at the top of the file.
        for item in (
            [ax.title, ax.xaxis.label, ax.yaxis.label]
            + ax.get_xticklabels()
            + ax.get_yticklabels()
        ):
            item.set_fontsize(18)

        st.pyplot(fig)
# --- Static help section rendered below the (optional) chart ---
st.markdown(
    """
----------------------------
"""
)
st.subheader("How To Use:")
st.write("1. This app works only for English text.")
st.write(
    "2a. By adjusting the N-gram range, you can discover the most frequently appearing single word, 2 or 3-word pairings (bigrams/trigrams)."
)
st.write(
    "2b. Examples: To find the most frequently appearing single word, move the yellow dot on '3' to '1'. For the most frequently appearing 2-word pairing, move the yellow dots on both ends to the centre, or the '2' position."
)
st.write(
    "3. Routine words and honorifics will be filtered out automatically in the results."
)