import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import streamlit as st
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

nltk.download("stopwords")

# global plot settings
mpl.rcParams["figure.dpi"] = 300
plt.rcParams.update({"font.size": 25})
mpl.rcParams["xtick.labelsize"] = 18
mpl.rcParams["ytick.labelsize"] = 18

# set_page_config must be the first Streamlit command;
# fall back to the pre-0.65 beta API on older Streamlit versions
try:
    st.set_page_config(layout="wide")
except AttributeError:
    st.beta_set_page_config(layout="wide")

# stop-words list: standard English stop words, extended with
# routine news-site terms and honorifics to filter from the results
stop_words = stopwords.words("english")
news_stopwords = [
    "would", "also", "may", "even", "like", "way", "year", "years",
    "one", "many", "us", "view", "singapore", "mr", "mrs", "ms",
    "madam", "madame", "prosperity", "every", "day", "https", "www",
    "channelnewsasia", "read", "today", "todayonline", "com", "news",
    "say", "said", "add", "added",
]
stop_words.extend(news_stopwords)


# text-cleaning function
def clean_text(text):
    text = text.encode("ascii", errors="ignore").decode("ascii")  # drop non-ASCII (e.g. Chinese) characters
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # strip URLs
    text = re.sub(r"[\n\t]", " ", text)  # replace newlines and tabs with spaces
    text = re.sub(r"\W", " ", text)  # replace non-word characters with spaces
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)  # drop standalone numbers
    text = re.sub(r" +", " ", text).strip()  # collapse multiple spaces into one
    return text


st.title("Word Frequency Counter")

user_input = st.text_area(
    "Paste selected text here",
    height=250,
)

ngram_select = st.slider(
    "N-gram range, or length of consecutive words",
    min_value=1,
    max_value=3,
    value=(1, 3),
)

st.markdown("----------------------------")

if user_input != "":
    with st.spinner("..."):
        input_text = clean_text(user_input)

        vect = CountVectorizer(
            stop_words=stop_words,
            min_df=1,
            ngram_range=(ngram_select[0], ngram_select[1]),
        )
        docs = vect.fit_transform([input_text])
        try:
            features = vect.get_feature_names_out()  # scikit-learn >= 1.0
        except AttributeError:
            features = vect.get_feature_names()  # removed in scikit-learn 1.2

        convo_title = " "  # blank visualiser title; the figure-level suptitle is used instead
        fig, ax = plt.subplots(figsize=(20, 12))
        fig.suptitle(
            "10 Most Frequently Used Word(s)",
            fontsize=25,
        )
        ax.legend().set_visible(False)

        visualiser = FreqDistVisualizer(
            features=features, n=10, color="steelblue", title=convo_title
        )
        visualiser.fit(docs)

        for item in (
            [ax.title, ax.xaxis.label, ax.yaxis.label]
            + ax.get_xticklabels()
            + ax.get_yticklabels()
        ):
            item.set_fontsize(18)

        st.pyplot(fig)

st.markdown("----------------------------")

st.subheader("How To Use:")
st.write("1. This app works only for English text.")
st.write(
    "2a. By adjusting the n-gram range, you can discover the most frequently "
    "appearing single words, 2-word pairings (bigrams), or 3-word pairings (trigrams)."
)
st.write(
    "2b. Examples: to find the most frequently appearing single word, move the "
    "yellow dot on '3' to '1'. For the most frequently appearing 2-word pairing, "
    "move the yellow dots on both ends to the centre, i.e. the '2' position."
)
st.write(
    "3. Routine words and honorifics are filtered out of the results automatically."
)
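
# To launch the app locally with Streamlit's CLI (assuming this file is
# saved as app.py; the filename is an assumption, use your own):
#   streamlit run app.py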