# app.py — Streamlit word-frequency counter (author: chinhon, commit 6023dde)
import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import re
import streamlit as st
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer
# One-time download of the NLTK stop-word corpus (no-op if already cached).
nltk.download('stopwords')

# Global matplotlib settings: high-DPI figures with enlarged fonts so the
# chart stays legible when embedded in the Streamlit page.
mpl.rcParams["figure.dpi"] = 300
plt.rcParams.update({'font.size': 25})
mpl.rcParams['xtick.labelsize'] = 18
mpl.rcParams['ytick.labelsize'] = 18

# Page config must be the first Streamlit command.  Current Streamlit
# exposes set_page_config(); very old releases only had the beta_ variant.
# Catch AttributeError specifically (a bare `except:` would also swallow
# KeyboardInterrupt/SystemExit and mask unrelated errors).
try:
    st.set_page_config(layout="wide")
except AttributeError:
    st.beta_set_page_config(layout="wide")
# Stop-word list: NLTK's English stop words plus domain-specific noise —
# newsroom boilerplate, honorifics, and Singapore outlet names / URL
# fragments that should never appear in the frequency chart.
news_stopwords = [
    "would", "also", "may", "even", "like", "way", "year", "years",
    "one", "many", "us", "view", "singapore", "mr", "mrs", "ms",
    "madam", "madame", "prosperity", "every", "day", "https", "www",
    "channelnewsasia", "read", "today", "todayonline", "com", "news",
    "say", "said", "add", "added",
]
stop_words = stopwords.words("english") + news_stopwords
#text-cleaning function
def clean_text(text):
    """Normalise raw pasted text before vectorisation.

    Strips non-ASCII characters (e.g. Chinese text), lowercases, removes
    URLs, converts newlines/tabs and punctuation to spaces, drops
    standalone numbers, and collapses runs of spaces.

    Args:
        text: Raw input string.

    Returns:
        A cleaned, lowercase, single-spaced string.
    """
    # Drop every character outside ASCII (removes CJK text, emoji, etc.).
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = text.lower()
    text = re.sub(r"http\S+", "", text)   # strip URLs
    # Newlines and tabs -> spaces.  (The original also substituted "\n\n"
    # after "\n" — dead code, since no newline could remain by then.)
    text = re.sub(r"[\n\t]", " ", text)
    # Every non-word character -> space.  This also makes a follow-up
    # [^\w\s] removal pointless, so that dead statement was dropped too.
    text = re.sub(r"\W", " ", text)
    # Remove standalone numbers at the start, middle, or end of the text.
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)
    # Collapse multiple spaces into one and trim the ends.
    text = re.sub(r" +", " ", text).strip()
    return text
# --- Page layout: title, input box, and n-gram slider ---
st.title("Word Frequency Counter")

# Multi-line text box where the user pastes the passage to analyse.
user_input = st.text_area(
    "Paste selected text here",
    height=250,
)

# Slider returns a (low, high) tuple; it is fed straight into
# CountVectorizer's ngram_range below (1 = single words, 3 = trigrams).
ngram_select = st.slider("N-gram Range, or length of consecutive words", min_value=1, max_value=3, value=(1, 3))

# Horizontal rule separating the input controls from the results area.
st.markdown(
    """
----------------------------
"""
)
# Run the analysis only once the user has pasted some text.
if user_input != "":
    with st.spinner("..."):
        input_text = clean_text(user_input)

        # Count term frequencies over the user-selected n-gram range,
        # filtering the combined NLTK + domain stop-word list.
        vect = CountVectorizer(
            stop_words=stop_words,
            min_df=1,
            ngram_range=(ngram_select[0], ngram_select[1]),
        )
        docs = vect.fit_transform([input_text])

        # get_feature_names() was deprecated in scikit-learn 1.0 and
        # removed in 1.2; prefer get_feature_names_out() and fall back
        # only on old versions that lack it.
        try:
            features = vect.get_feature_names_out()
        except AttributeError:
            features = vect.get_feature_names()

        convo_title = " "  # blank per-axes title; figure suptitle is used instead
        fig, ax = plt.subplots(figsize=(20, 12))
        fig.suptitle(
            "10 Most Frequently Used Word(s)",
            fontsize=25,
        )
        ax.legend().set_visible(False)

        # Yellowbrick draws the top-10 term-frequency bar chart onto the
        # current axes when fit() is called.
        visualiser = FreqDistVisualizer(
            features=features, n=10, color="steelblue", title=convo_title
        )
        visualiser.fit(docs)

        # Enlarge the title, axis labels, and tick labels for readability
        # at the 300-dpi figure size set at the top of the file.
        for item in (
            [ax.title, ax.xaxis.label, ax.yaxis.label]
            + ax.get_xticklabels()
            + ax.get_yticklabels()
        ):
            item.set_fontsize(18)

        st.pyplot(fig)
# --- Static help section rendered below the (optional) chart ---
st.markdown(
    """
----------------------------
"""
)
st.subheader("How To Use:")
st.write("1. This app works only for English text.")
st.write(
    "2a. By adjusting the N-gram range, you can discover the most frequently appearing single word, 2 or 3-word pairings (bigrams/trigrams)."
)
st.write(
    "2b. Examples: To find the most frequently appearing single word, move the yellow dot on '3' to '1'. For the most frequently appearing 2-word pairing, move the yellow dots on both ends to the centre, or the '2' position."
)
st.write(
    "3. Routine words and honorifics will be filtered out automatically in the results."
)