import streamlit as st
import pandas as pd
import json
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Set page config
st.set_page_config(layout="wide", page_title="Macrocosmos HF Dataset Explorer", page_icon="π")

# Custom CSS (keep existing styles)
st.markdown("""
<style>
    .stApp {
        max-width: 1200px;
        margin: 0 auto;
        font-family: 'Helvetica Neue', Arial, sans-serif;
    }
    .cta-container {
        background-color: #f0f8ff;
        border-radius: 10px;
        padding: 20px;
        margin-top: 30px;
        margin-bottom: 30px;
        border: 2px solid #1e90ff;
        text-align: center;
    }
    .cta-title {
        color: #1e90ff;
        font-size: 24px;
        font-weight: bold;
        margin-bottom: 10px;
    }
    .cta-description {
        color: #333;
        font-size: 16px;
        margin-bottom: 20px;
    }
    .stButton > button {
        background-color: #1e90ff;
        color: white;
        font-size: 18px;
        font-weight: bold;
        padding: 10px 24px;
        border-radius: 5px;
        border: none;
        transition: all 0.3s ease;
    }
    .stButton > button:hover {
        background-color: #0066cc;
        box-shadow: 0 4px 8px rgba(0, 0, 0, 0.1);
    }
</style>
""", unsafe_allow_html=True)

# Title and description
st.title("π Macrocosmos HF Dataset Explorer")
st.markdown("Explore massive datasets hosted on Hugging Face, totaling approximately 100GB of data.")


# Function to load dataset information
def load_datasets():
    datasets = [
        # Reddit datasets
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_69", "Number of rows": "6,000,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_229", "Number of rows": "44,815,182"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_88", "Number of rows": "253,506,882"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_218", "Number of rows": "562,042"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_13", "Number of rows": "18,931,749"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_219", "Number of rows": "227,599,340"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_112", "Number of rows": "301,588,714"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_71", "Number of rows": "259,924,884"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_209", "Number of rows": "209,698,975"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_218", "Number of rows": "7,064,613"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "249,000,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "303,000,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "1,120,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "132,000,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "130,000,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "31,200,000"},
        {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "26,900,000"},
        # X datasets
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/littleGuagua/x_dataset_0", "Number of rows": "331,611,777"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/suul999922/x_dataset_71", "Number of rows": "8,998,828"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/thayallans/x_dataset_28", "Number of rows": "178,669"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/apidojo/x_dataset_242", "Number of rows": "499,067"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_112", "Number of rows": "331,500,777"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/x_dataset_218", "Number of rows": "1,753,878"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/SAVE0x0/x_dataset_191", "Number of rows": "92,588"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/johnny8188/x_dataset_187", "Number of rows": "52,762"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "332,000,000"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "9,900"},
        {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "89,000"}
    ]
    return datasets

# Function to convert a row count string (with or without thousands separators) to a float
def parse_row_count(row_count):
    return float(row_count.replace(',', ''))
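
# For example, parse_row_count("44,815,182") returns 44815182.0 and
# parse_row_count("9,900") returns 9900.0 (values taken from the table above).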

# Load datasets
datasets = load_datasets()
df = pd.DataFrame(datasets)

# Calculate total rows
total_rows = sum(parse_row_count(rows) for rows in df['Number of rows'])

# Display statistics
col1, col2 = st.columns(2)
with col1:
    st.metric("Total Rows", f"{total_rows / 1e9:.2f}B")
with col2:
    st.metric("Total Datasets", len(df))

# Display the dataset table
st.subheader("Dataset Overview")
st.dataframe(
    df,
    column_config={
        "Source": st.column_config.TextColumn("Source"),
        "DataSet repo link": st.column_config.LinkColumn("Repository"),
        "Number of rows": st.column_config.TextColumn("Rows"),
    },
    hide_index=True,
    use_container_width=True
)

# Call-to-action section with styled button
st.markdown("""
<div class="cta-container">
    <div class="cta-title">π Explore Dataset Insights</div>
    <div class="cta-description">
        Dive deep into the rich analytics of our dataset. Uncover trends, distributions, and key metrics that will enhance your understanding and guide your research.
    </div>
</div>
""", unsafe_allow_html=True)

# Centered button
col1, col2, col3 = st.columns([1, 2, 1])
with col2:
    show_analysis = st.button("Reveal Dataset Analysis", use_container_width=True)

# Display dataset analysis if the button was clicked
if show_analysis:
    # Load analysis results
    def load_analysis_results():
        with open('analysis_results.json', 'r') as f:
            return json.load(f)
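
    # Expected shape of analysis_results.json, inferred from the fields read below
    # (an illustrative sketch, not a guaranteed schema):
    # {
    #     "structure": {"total_partitions": ..., "total_rows": ..., "columns": [...],
    #                   "date_range": [start, end]},
    #     "communities": [{"communityName": ..., "count": ..., "percentage": ...}, ...],
    #     "time_distribution": [{"date": ..., "count": ...}, ...],
    #     "sentiment_distribution": [{"sentiment": ..., "count": ...}, ...],
    #     "data_type_distribution": [{"dataType": ..., "count": ...}, ...],
    #     "top_topics": [...],
    #     "avg_text_length": ...
    # }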
    analysis_results = load_analysis_results()

    st.subheader("Analysis of a Sample Reddit Dataset")
    st.write("This analysis is based on a sample from one of the Reddit datasets.")

    # Display Dataset Structure
    st.subheader("Dataset Structure")
    structure = analysis_results['structure']
    col1, col2, col3, col4 = st.columns(4)
    col1.metric("Total Partitions", structure['total_partitions'])
    col2.metric("Total Rows", f"{structure['total_rows']:,}")
    col3.metric("Number of Columns", len(structure['columns']))
    col4.metric("Date Range", f"{structure['date_range'][0]} to {structure['date_range'][1]}")

    with st.expander("Show Columns"):
        st.write(", ".join(structure['columns']))

    # Display Top Communities
    st.subheader("Top Communities")
    communities_df = pd.DataFrame(analysis_results['communities'])
    fig = go.Figure(data=[go.Bar(
        x=communities_df['communityName'],
        y=communities_df['count'],
        text=communities_df['percentage'].apply(lambda x: f'{x:.2%}'),
        textposition='auto',
        marker_color='#1e88e5'
    )])
    fig.update_layout(title_text='Top Communities Distribution')
    fig.update_traces(marker_line_color='rgb(8,48,107)', marker_line_width=1.5, opacity=0.6)
    st.plotly_chart(fig, use_container_width=True)

    # Display Time Distribution
    st.subheader("Time Distribution")
    time_df = pd.DataFrame(analysis_results['time_distribution'])
    time_df['date'] = pd.to_datetime(time_df['date'])
    fig = go.Figure(data=[go.Scatter(x=time_df['date'], y=time_df['count'], mode='lines+markers')])
    fig.update_layout(title_text='Posts Over Time')
    st.plotly_chart(fig, use_container_width=True)

    # Display Sentiment Distribution
    st.subheader("Sentiment Distribution")
    sentiment_df = pd.DataFrame(analysis_results['sentiment_distribution'])
    fig = go.Figure(data=[go.Pie(labels=sentiment_df['sentiment'], values=sentiment_df['count'], textinfo='percent+label')])
    fig.update_layout(title_text='Sentiment Distribution')
    fig.update_traces(marker=dict(colors=['#4CAF50', '#FFC107', '#F44336']))
    st.plotly_chart(fig, use_container_width=True)

    # Display Data Type Distribution
    st.subheader("Data Type Distribution")
    data_type_df = pd.DataFrame(analysis_results['data_type_distribution'])
    fig = go.Figure(data=[go.Pie(labels=data_type_df['dataType'], values=data_type_df['count'], textinfo='percent+label')])
    fig.update_layout(title_text='Data Type Distribution')
    fig.update_traces(marker=dict(colors=['#2196F3', '#FF9800']))
    st.plotly_chart(fig, use_container_width=True)

    # Display Top Topics
    st.subheader("Top Topics")
    topics_df = pd.DataFrame(analysis_results['top_topics'])
    st.dataframe(topics_df, use_container_width=True)

    # Display Average Text Length
    st.metric("Average Text Length", f"{analysis_results['avg_text_length']:.2f} characters")

# Add instructions for using the datasets
st.subheader("How to Use These Datasets")

code = '''
from datasets import load_dataset

dataset = load_dataset("username/dataset_name")
'''
st.code(code, language='python')
st.markdown(""" | |
1. Click on the dataset link to visit its Hugging Face page. | |
2. On the dataset page, you'll find information about the dataset's content, structure, and usage. | |
3. Use the code above to load a dataset, replacing `"username/dataset_name"` with the actual dataset identifier. | |
4. For these large datasets, consider using streaming or loading specific subsets to manage memory usage. | |
5. Always check the dataset's license and usage restrictions before incorporating it into your project. | |
""") | |

# Footer
st.markdown("---")
st.markdown("Created by Macrocosmos with ❤️")