"""Streamlit page exploring word frequencies in the huggingartists/gorillaz dataset.

The script loads the dataset's ``train`` split, cleans its ``text`` column
(stop-word removal, special-character stripping, lower-casing), and renders
the raw table, the cleaned table, a bar chart of word counts and a word cloud.
"""
import re
import string

import matplotlib.pyplot as plt
import nltk
import pandas as pd
import streamlit as st
from datasets import load_dataset
from nltk.corpus import stopwords
from wordcloud import STOPWORDS, WordCloud

nltk.download("stopwords")
stop = stopwords.words('english')
# Set copy of `stop` so per-token membership tests are O(1) instead of a list scan.
_STOP_SET = frozenset(stop)
# Compiled once; raw string so `\d` / `\s` reach the regex engine untouched
# instead of being parsed as (invalid) string escapes.
_SPECIAL_CHARS = re.compile(r'[^a-zA-Z\d\s]')

dataset = load_dataset("huggingartists/gorillaz")
df = pd.DataFrame.from_dict(dataset["train"])
st.dataframe(df)

st.write("Removed special characters")


def standardize(text, remove_digits=True):
    """Return *text* lower-cased with special characters removed.

    Every character that is not an ASCII letter, a digit or whitespace is
    dropped.

    Args:
        text: The string to clean.
        remove_digits: Accepted for backward compatibility but currently
            ignored; digits are always kept.

    Returns:
        The cleaned, lower-cased string.
    """
    return _SPECIAL_CHARS.sub('', text).lower()


def _remove_stopwords(text):
    """Return *text* lower-cased with English stop words removed.

    Tokens are compared case-insensitively and with surrounding punctuation
    ignored, so "The" and "you," are recognised as stop words. Inner
    apostrophes are kept for the comparison so contractions such as "don't"
    still match the NLTK list.
    """
    kept = [
        token
        for token in text.lower().split()
        if token.strip(string.punctuation) not in _STOP_SET
    ]
    return ' '.join(kept)


# Stop words are removed first (while apostrophes are still present), then the
# remaining special characters are stripped.
df["text"] = df["text"].apply(_remove_stopwords).apply(standardize)
st.dataframe(df)

# One row per word, indexed by word, sorted by descending count.
words = df["text"].str.split(expand=True).unstack().value_counts()
# Only ranks 21-40 are charted; the 20 most frequent words are skipped.
st.bar_chart(words[20:40])


def word_cloud(content, title):
    """Render a word cloud of *content* in the Streamlit page.

    Args:
        content: A pandas Series mapping each word (index) to its count
            (value), e.g. the result of ``value_counts()``.
        title: Title drawn above the word cloud.
    """
    # `generate_from_frequencies` does not apply the `stopwords` option, so
    # the word-cloud stop words are filtered out here instead.
    frequencies = {
        word: int(count)
        for word, count in content.items()
        if word not in STOPWORDS
    }
    wc = WordCloud(background_color='white', max_words=200, max_font_size=50)
    # Use the actual counts so word size reflects frequency; joining the
    # unique index words would give every word a count of one.
    wc.generate_from_frequencies(frequencies)

    fig, ax = plt.subplots(figsize=(16, 13))
    ax.set_title(title, fontsize=20)
    ax.imshow(wc.recolor(colormap='Pastel2', random_state=42), alpha=0.98)
    ax.axis('off')
    # Pass the figure explicitly rather than relying on global pyplot state;
    # this makes the 'deprecation.showPyplotGlobalUse' override unnecessary.
    st.pyplot(fig)
    # Release the figure so repeated script reruns do not accumulate figures.
    plt.close(fig)


word_cloud(words, "Word Cloud")