Spaces:
Runtime error
Runtime error
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import nltk | |
import re | |
import streamlit as st | |
from nltk.corpus import stopwords | |
from sklearn.feature_extraction.text import CountVectorizer | |
from yellowbrick.text import FreqDistVisualizer | |
# Fetch the NLTK stop-word corpus that the stop-word list below is built from.
nltk.download("stopwords")

# Global plot styling: high-resolution figures with large text and tick labels.
# plt.rcParams and mpl.rcParams refer to the same settings object, so a single
# update covers every key.
mpl.rcParams.update(
    {
        "figure.dpi": 300,
        "font.size": 25,
        "xtick.labelsize": 18,
        "ytick.labelsize": 18,
    }
)
# Page configuration must be the first Streamlit command issued by the script.
try:
    st.set_page_config(layout="wide")
except AttributeError:
    # Older Streamlit releases only expose the beta-prefixed name. Catching
    # AttributeError alone (instead of a bare except) lets a genuine
    # configuration error surface rather than being masked by the fallback.
    st.beta_set_page_config(layout="wide")
# Extra words filtered out on top of NLTK's English stop words: filler words,
# honorifics, and fragments of news-site names and URLs.
news_stopwords = [
    "would", "also", "may", "even", "like", "way",
    "year", "years", "one", "many", "us", "view",
    "singapore",
    "mr", "mrs", "ms", "madam", "madame",
    "prosperity", "every", "day",
    "https", "www", "channelnewsasia", "read", "today", "todayonline", "com",
    "news", "say", "said", "add", "added",
]

# Full stop-word list: NLTK's English list followed by the extras above.
stop_words = [*stopwords.words("english"), *news_stopwords]
# text-cleaning function
def clean_text(text):
    """Normalise raw text into lower-case, space-separated word tokens.

    Steps, in order:
      1. drop every non-ASCII character (e.g. Chinese characters);
      2. lower-case the text;
      3. remove URLs (anything starting with ``http`` up to the next space);
      4. replace every non-word character (punctuation, newlines, tabs)
         with a space;
      5. remove standalone numbers, while keeping digits that are part of a
         word such as ``covid19``;
      6. collapse runs of spaces and trim both ends.

    Args:
        text: The raw input string.

    Returns:
        The cleaned string; empty if nothing but noise was supplied.
    """
    text = text.encode("ascii", errors="ignore").decode("ascii")
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # One pass handles newlines, tabs and punctuation alike; afterwards the
    # text contains only word characters and plain spaces.
    text = re.sub(r"\W", " ", text)
    # Word boundaries do not consume the surrounding spaces, so every number
    # in a run such as "2019 2020 2021" is removed, as is a number that makes
    # up the whole input.
    text = re.sub(r"\b\d+\b", " ", text)
    return re.sub(r" +", " ", text).strip()
st.title("Word Frequency Counter")

# Free-text box for the passage to analyse.
user_input = st.text_area("Paste selected text here", height=250)

# Two-handled slider; its (low, high) pair is used as the n-gram range.
ngram_select = st.slider(
    "N-gram Range, or length of consecutive words",
    min_value=1,
    max_value=3,
    value=(1, 3),
)

# Horizontal rule between the inputs and the results.
st.markdown("\n----------------------------\n")
# Count and plot the most frequent n-grams once some non-blank text is given.
if user_input.strip():
    with st.spinner("..."):
        input_text = clean_text(user_input)
        vect = CountVectorizer(
            stop_words=stop_words,
            min_df=1,
            ngram_range=(ngram_select[0], ngram_select[1]),
        )
        try:
            docs = vect.fit_transform([input_text])
        except ValueError:
            # scikit-learn raises ValueError when the vocabulary is empty,
            # e.g. when the text consists solely of stop words or numbers.
            st.warning(
                "No countable words were found. Please paste a longer "
                "passage of English text."
            )
        else:
            # get_feature_names() was removed in scikit-learn 1.2; fall back
            # to it only on releases that predate get_feature_names_out().
            try:
                features = vect.get_feature_names_out()
            except AttributeError:
                features = vect.get_feature_names()

            convo_title = " "
            fig, ax = plt.subplots(figsize=(20, 12))
            try:
                fig.suptitle(
                    "10 Most Frequently Used Word(s)",
                    fontsize=25,
                )
                ax.legend().set_visible(False)
                visualiser = FreqDistVisualizer(
                    features=features,
                    ax=ax,  # draw on this figure explicitly, not on plt.gca()
                    # Never request more bars than there are distinct terms.
                    n=min(10, len(features)),
                    color="steelblue",
                    title=convo_title,
                )
                visualiser.fit(docs)
                for item in (
                    [ax.title, ax.xaxis.label, ax.yaxis.label]
                    + ax.get_xticklabels()
                    + ax.get_yticklabels()
                ):
                    item.set_fontsize(18)
                st.pyplot(fig)
            finally:
                # pyplot keeps every figure alive until it is closed; close
                # this one so reruns of the script do not accumulate figures.
                plt.close(fig)
# Horizontal rule above the usage notes.
st.markdown("\n----------------------------\n")

st.subheader("How To Use:")

# Usage notes, emitted one st.write call per entry in the order listed.
for usage_note in (
    "1. This app works only for English text.",
    "2a. By adjusting the N-gram range, you can discover the most frequently appearing single word, 2 or 3-word pairings (bigrams/trigrams).",
    "2b. Examples: To find the most frequently appearing single word, move the yellow dot on '3' to '1'. For the most frequently appearing 2-word pairing, move the yellow dots on both ends to the centre, or the '2' position.",
    "3. Routine words and honorifics will be filtered out automatically in the results.",
):
    st.write(usage_note)