import streamlit as st
import pandas as pd
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
from bertopic import BERTopic
from wordcloud import WordCloud
import nltk
from nltk import FreqDist
import plotly.express as px
import plotly.graph_objects as go
import json
import warnings
from datetime import datetime

nltk.download('stopwords')
nlp = spacy.load("fr_core_news_sm")
warnings.filterwarnings('ignore')

# parse release_date as a date in pandas
df = pd.read_csv("gdiy_data.csv", sep=',', parse_dates=['release_date'])


def clean_data(df):
    '''Clean the raw episode data.

    Args:
        df: pandas DataFrame read from gdiy_data.csv
    Returns:
        pandas DataFrame indexed by release date
    '''
    df = df.drop('Unnamed: 0', axis=1)
    df['description'] = df['description'].str.lower()
    df = df.set_index('release_date')

    # Remove [EXTRAIT] and [REDIFF] episodes
    df = df.loc[[not name.startswith(('[EXTRAIT]', '[REDIFF]')) for name in df['name']]]

    # Convert duration from milliseconds to minutes
    df['duration_min'] = df['duration_ms'] / (60 * 1000)
    df['year'] = df.index.year
    df['month'] = df.index.month
    return df


df_clean = clean_data(df)

# Part-of-speech tags to remove ('ADV' = adverb, 'ADJ' = adjective, ...)
pos = ['ADV', 'PRON', 'CCONJ', 'PUNCT', 'DET', 'ADP', 'SPACE', 'ADJ', 'VERB']

# Frequent podcast-specific words added to the stopwords
context = ['ouais', 'épisode', 'faire', 'morgan', 'prudhomme', 'lire', 'génération',
           'podcast', 'gdiy', 'recommande', 'deux', 'quand', 'the', 'livre',
           'être', 'yourself', 'orso', 'doi', 'an', 'merci', 'avoir', 'timeline',
           'face', 'million', 'monde', 'vie', 'and', 'fait', 'abonnez', 'parce',
           'ouai', 'sai', 'it', 'do', 'mets', 'yourself', 'si', 'chose', 'oui',
           'truc', 'dessus', 'traite', 'that']

# Load the cleaned documents and their release dates
with open('./clean_docs.json', 'r') as f:
    clean_text = json.load(f)
docs = clean_text['text']

# Load the pre-trained BERTopic model
topic_model = BERTopic.load("./model_dir/")

timestamps = [datetime.strptime(date_time, "%d/%m/%Y") for date_time in clean_text["date"]]
topics_over_time = topic_model.topics_over_time(docs, timestamps,
                                                global_tuning=True,
                                                evolution_tuning=True,
                                                nr_bins=20)

# Visualize topics over time
time_fig = topic_model.visualize_topics_over_time(topics_over_time, top_n_topics=10)
time_fig.update(layout_showlegend=False)
time_fig.update_layout(autosize=False, width=800, height=400)

# Group topic words per year (topic -1 holds outliers and is dropped)
topics_over_time = topics_over_time[topics_over_time['Topic'] != -1]
topics_over_time.set_index('Timestamp', inplace=True)
topics_over_time['year'] = topics_over_time.index.year
topic_per_year = topics_over_time.groupby(['year'])['Words'].apply(lambda x: x.str.cat(sep=' '))

# Bar chart of the main topics
topic_fig = topic_model.visualize_barchart(n_words=10)
topic_fig.update_layout(autosize=False, width=800)


def wordscloud(text: str):
    '''Build a word cloud from a string and display it with Streamlit.

    Args:
        text: space-separated words
    '''
    word_cloud = WordCloud(background_color='white').generate(text)
    fig, ax = plt.subplots()
    ax.imshow(word_cloud, interpolation='bilinear')
    plt.axis("off")
    st.pyplot(fig)


# Average episode duration per year
data = df_clean.resample('Y')['duration_min'].mean()

# Number of podcast episodes per year
podcast_per_year = df_clean['year'].value_counts().reset_index()
podcast_per_year.rename(columns={'index': 'year', 'year': 'nb_podcast'}, inplace=True)

# Line chart of average duration per year
fig = px.line(x=data.index.year, y=data, text=data.astype('int'), markers=True)
fig.update_traces(textposition="bottom right")
fig.update_layout(autosize=False, width=800)
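# ---------------------------------------------------------------------------
# `nlp`, `pos`, `context` and the NLTK stopwords are loaded above but never
# used in this script; the cleaned documents are read from clean_docs.json.
# The helper below is only a sketch of how such documents might be produced
# with spaCy (hypothetical helper, not the pipeline that built the file).
# ---------------------------------------------------------------------------
from nltk.corpus import stopwords


def preprocess(raw_texts, removed_pos=pos, extra_stopwords=context):
    '''Lemmatize French texts, dropping the listed POS tags and stopwords.'''
    all_stopwords = set(stopwords.words('french')) | set(extra_stopwords)
    cleaned = []
    for doc in nlp.pipe(raw_texts):
        tokens = [token.lemma_.lower() for token in doc
                  if token.pos_ not in removed_pos
                  and not token.is_stop
                  and token.lemma_.lower() not in all_stopwords]
        cleaned.append(' '.join(tokens))
    return cleaned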
st.write('''# Nous sommes la moyenne des personnes que nous fréquentons.''')

# Average episode duration per year
st.header('Durée moyenne des podcasts par année')
st.plotly_chart(fig, use_container_width=False, sharing="streamlit")

# Word cloud of all terms (all cleaned documents concatenated)
st.header('Les mots fréquemment utilisés dans le podcast')
wordscloud(' '.join(docs))

# Main topics found by the model
st.header('Sujets évoqués dans le podcast')
st.plotly_chart(topic_fig, use_container_width=False, sharing="streamlit")

# Topics over time
st.header('Sujets évoqués au cours du temps dans le podcast')
st.plotly_chart(time_fig, use_container_width=False, sharing="streamlit")

# Word cloud of the topic words for each year (2017-2022)
for year in range(2017, 2023):
    st.header(f'Sujets en {year}')
    text = topic_per_year[year].replace(',', '')
    wordscloud(text)
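# ---------------------------------------------------------------------------
# Sketch (assumption): the number of episodes per year is computed above
# (`podcast_per_year`) but never displayed. Below is one possible way to show
# it; the header wording is not from the original script.
# ---------------------------------------------------------------------------
episodes_per_year = df_clean['year'].value_counts().sort_index()
count_fig = px.bar(x=episodes_per_year.index, y=episodes_per_year.values,
                   labels={'x': 'année', 'y': "nombre d'épisodes"})
count_fig.update_layout(autosize=False, width=800)
st.header("Nombre d'épisodes par année")
st.plotly_chart(count_fig, use_container_width=False, sharing="streamlit")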