import streamlit as st import pandas as pd import json import plotly.graph_objects as go from plotly.subplots import make_subplots # Set page config st.set_page_config(layout="wide", page_title="Macrocosmos HF Dataset Explorer", page_icon="🌌") # Custom CSS (keep existing styles) st.markdown(""" """, unsafe_allow_html=True) # Title and description st.title("🌌 Macrocosmos HF Dataset Explorer") st.markdown("Explore massive datasets hosted on Hugging Face, totaling approximately 100GB of data.") # Function to load dataset information @st.cache_data def load_datasets(): datasets = [ # Reddit datasets {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_69", "Number of rows": "6000000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_229", "Number of rows": "44,815,182"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_88", "Number of rows": "253,506,882"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_218", "Number of rows": "562,042"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_13", "Number of rows": "18,931,749"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_219", "Number of rows": "227,599,340"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_112", "Number of rows": "301,588,714"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_71", "Number of rows": "259,924,884"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_209", "Number of rows": "209,698,975"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_218", "Number of rows": "7,064,613"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "249000000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "303000000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "1120000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "132000000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "130000000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "31200000"}, {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "26900000"}, # X datasets {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/littleGuagua/x_dataset_0", "Number of rows": "331,611,777"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/suul999922/x_dataset_71", "Number of rows": "8,998,828"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/thayallans/x_dataset_28", "Number of rows": "178,669"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/apidojo/x_dataset_242", "Number of rows": "499,067"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_112", "Number of rows": "331,500,777"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/x_dataset_218", "Number of rows": "1,753,878"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/SAVE0x0/x_dataset_191", "Number of rows": "92,588"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/johnny8188/x_dataset_187", "Number of rows": "52,762"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "332000000"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "9900"}, {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "89000"} ] return datasets # Function to convert row count to float def parse_row_count(row_count): return float(row_count.replace(',', '')) # Load datasets datasets = load_datasets() df = pd.DataFrame(datasets) # Calculate total rows total_rows = sum(parse_row_count(rows) for rows in df['Number of rows']) # Display statistics col1, col2 = st.columns(2) with col1: st.metric("Total Rows", f"{total_rows / 1e9:.2f}B") with col2: st.metric("Total Datasets", len(df)) # Display the dataset table st.subheader("Dataset Overview") st.dataframe( df, column_config={ "Source": st.column_config.TextColumn("Source"), "DataSet repo link": st.column_config.LinkColumn("Repository"), "Number of rows": st.column_config.TextColumn("Rows"), }, hide_index=True, use_container_width=True ) # Call-to-action section with styled button st.markdown("""