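"""Word Frequency Counter.

A Streamlit app that counts the most frequent words or n-grams (1 to 3
consecutive words) in pasted English text, filters out common and
news-specific stopwords, and plots the 10 most frequent with Yellowbrick's
FreqDistVisualizer.
"""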
import re

import matplotlib as mpl
import matplotlib.pyplot as plt
import nltk
import streamlit as st

from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from yellowbrick.text import FreqDistVisualizer

nltk.download("stopwords")  # ensure the NLTK stopword corpus is available

# global plot styling
mpl.rcParams["figure.dpi"] = 300
mpl.rcParams["font.size"] = 25
mpl.rcParams["xtick.labelsize"] = 18
mpl.rcParams["ytick.labelsize"] = 18

# st.set_page_config must be the first Streamlit command in the script;
# fall back to the beta API on older Streamlit versions
try:
    st.set_page_config(layout="wide")
except AttributeError:
    st.beta_set_page_config(layout="wide")

# base English stop-word list from NLTK
stop_words = stopwords.words("english")

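# extra stopwords for news text: filler words, honorifics, and URL fragments
# from outlets such as channelnewsasia and todayonline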
news_stopwords = [
    "would",
    "also",
    "may",
    "even",
    "like",
    "way",
    "year",
    "years",
    "one",
    "many",
    "us",
    "view",
    "singapore",
    "mr",
    "mrs",
    "ms",
    "madam",
    "madame",
    "prosperity",
    "every",
    "day",
    "https",
    "www",
    "channelnewsasia",
    "read",
    "today",
    "todayonline",
    "com",
    "news",
    "say",
    "said",
    "add",
    "added"

]

stop_words.extend(news_stopwords)

# text-cleaning function
def clean_text(text):
    text = text.encode("ascii", errors="ignore").decode("ascii")  # drop non-ASCII (e.g. Chinese) characters
    text = text.lower()
    text = re.sub(r"http\S+", "", text)  # strip URLs
    text = re.sub(r"[\n\t]", " ", text)  # newlines and tabs to spaces
    text = re.sub(r"\W", " ", text)  # remaining non-word characters to spaces
    text = re.sub(r"^\d+\s|\s\d+\s|\s\d+$", " ", text)  # drop standalone numbers
    text = re.sub(r" +", " ", text).strip()  # collapse runs of spaces
    return text
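# Example:
#   clean_text("Read more at https://www.channelnewsasia.com!\nMore tomorrow.")
#   returns "read more at more tomorrow"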

st.title("Word Frequency Counter")

user_input = st.text_area(
    "Paste selected text here",
    height=250,
)

ngram_select = st.slider(
    "N-gram Range, or length of consecutive words",
    min_value=1,
    max_value=3,
    value=(1, 3),
)
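# with a (min, max) tuple as its default value, st.slider renders a range
# slider and returns a (low, high) tuple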

st.markdown(
    """
----------------------------
"""
)
if user_input != "":
    with st.spinner("Counting word frequencies..."):

        input_text = clean_text(user_input)

        vect = CountVectorizer(
            stop_words=stop_words,
            min_df=1,
            ngram_range=(ngram_select[0], ngram_select[1]),
        )

        docs = vect.fit_transform([input_text])
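        # docs is a 1 x n_features sparse count matrix for the single pasted document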

        features = vect.get_feature_names_out()  # get_feature_names() was removed in scikit-learn 1.2

        convo_title = " "  # blank axes title; the figure suptitle is used instead

        fig, ax = plt.subplots(figsize=(20, 12))

        fig.suptitle(
            "10 Most Frequently Used Word(s)",
            fontsize=25,
        )

        visualiser = FreqDistVisualizer(
            features=features, ax=ax, n=10, color="steelblue", title=convo_title
        )

        visualiser.fit(docs)
        visualiser.finalize()  # applies the (blank) title and tidies the axes

        # hide the legend if the visualiser added one; calling ax.legend() with
        # no labelled artists would only create an empty legend box
        legend = ax.get_legend()
        if legend is not None:
            legend.set_visible(False)

        # enlarge the title, axis labels and tick labels for readability
        for item in (
            [ax.title, ax.xaxis.label, ax.yaxis.label]
            + ax.get_xticklabels()
            + ax.get_yticklabels()
        ):
            item.set_fontsize(18)

        st.pyplot(fig)

st.markdown(
    """
----------------------------
"""
)
st.subheader("How To Use:")

st.write("1. This app works only for English text.")

st.write(
    "2a. By adjusting the N-gram range, you can find the most frequently appearing single words, 2-word pairings (bigrams) or 3-word pairings (trigrams)."
)

st.write(
    "2b. Examples: to find the most frequently appearing single words, move the yellow dot on '3' to '1'. For the most frequently appearing 2-word pairings, move the yellow dots on both ends to the centre, i.e. the '2' position."
)

st.write(
    "3. Routine words and honorifics are filtered out of the results automatically."
)