vtrubamacrocosmos commited on
Commit
b020462
1 Parent(s): c3df010

initial commit

Browse files
Files changed (3) hide show
  1. app.py +102 -0
  2. macrocosmos-black.png +0 -0
  3. requirements.txt +3 -0
app.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+
4
+ # Set page config
5
+ st.set_page_config(layout="wide", page_title="Macrocosmos HF Dataset Explorer")
6
+
7
+ # Custom CSS
8
+ st.markdown("""
9
+ <style>
10
+ .stApp {
11
+ max-width: 1200px;
12
+ margin: 0 auto;
13
+ }
14
+ .st-emotion-cache-1r6slb0 {
15
+ background-color: #f0f2f6;
16
+ border-radius: 10px;
17
+ padding: 20px;
18
+ margin-bottom: 20px;
19
+ }
20
+ .st-emotion-cache-1wivap2 {
21
+ background-color: #ffffff;
22
+ border-radius: 10px;
23
+ padding: 20px;
24
+ box-shadow: 0 4px 6px rgba(0, 0, 0, 0.1);
25
+ }
26
+ .company-logo {
27
+ max-width: 200px;
28
+ margin-bottom: 20px;
29
+ }
30
+ </style>
31
+ """, unsafe_allow_html=True)
32
+
33
# Company logo.
# FIX: use_column_width is deprecated in st.image (superseded by
# use_container_width) and is redundant here anyway because an explicit
# pixel width is given — dropping it keeps rendering identical and avoids
# a deprecation warning / breakage on newer Streamlit releases.
st.image("macrocosmos-black.png", width=200)

# Title and description
st.title("🤗 Hugging Face Large Dataset Explorer")
st.markdown("Explore massive datasets hosted on Hugging Face, totaling approximately 100GB of data.")
39
+
40
# Static catalogue of the showcased datasets.
def load_datasets():
    """Return the (hard-coded) dataset catalogue.

    Each entry is a dict with keys "Source", "DataSet repo link" and
    "Number of rows" (a human-readable string such as "332 MLN" or "9.9 K").
    """
    # NOTE(review): entries 3 and 6 reference the same repo
    # (arrmlet/reddit_dataset_123) with different counts, and entry 3 is
    # labeled "X" while linking to a reddit dataset — verify with the data
    # owner. Data reproduced unchanged here.
    catalogue = [
        ("X", "https://huggingface.co/datasets/icedwind/x_dataset_19", "332 MLN"),
        ("X", "https://huggingface.co/datasets/wenknow/x_dataset", "9.9 K"),
        ("X", "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "89 K"),
        ("Reddit", "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "249 MLN"),
        ("Reddit", "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "303 MLN"),
        ("Reddit", "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "1.12 MLN"),
        ("Reddit", "https://huggingface.co/datasets/chris241/reddit_dataset_75", "132 MLN"),
        ("Reddit", "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "130 MLN"),
        ("Reddit", "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "31.2 MLN"),
        ("Reddit", "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "26.9 MLN"),
    ]
    return [
        {"Source": source, "DataSet repo link": link, "Number of rows": rows}
        for source, link, rows in catalogue
    ]
54
+
55
# Load datasets
datasets = load_datasets()
df = pd.DataFrame(datasets)


def _rows_in_millions(count):
    """Convert a row-count label such as '332 MLN' or '9.9 K' to millions.

    Labels are '<number> <unit>' with unit 'MLN' (millions) or 'K'
    (thousands); a bare number is assumed to be raw rows — TODO confirm
    with the data source.
    """
    value, _, unit = str(count).strip().partition(" ")
    scale = {"MLN": 1.0, "K": 1e-3}.get(unit.strip().upper(), 1e-6)
    return float(value.replace(",", "")) * scale


# Display statistics
col1, col2 = st.columns(2)
with col1:
    # BUG FIX: the previous code summed the leading number of every label
    # regardless of unit, so '9.9 K' contributed 9.9 *million* to a total
    # reported in millions. Convert each label to millions before summing.
    total_rows = sum(_rows_in_millions(rows) for rows in df['Number of rows'])
    st.metric("Total Rows", f"{total_rows:.2f} Million")
with col2:
    st.metric("Total Datasets", len(df))
66
+
67
# Render the catalogue as an interactive table with clickable repo links.
st.subheader("Dataset Overview")
_COLUMN_CONFIG = {
    "Source": st.column_config.TextColumn("Source"),
    "DataSet repo link": st.column_config.LinkColumn("Repository"),
    "Number of rows": st.column_config.TextColumn("Rows"),
}
st.dataframe(
    df,
    column_config=_COLUMN_CONFIG,
    hide_index=True,
    use_container_width=True,
)
79
+
80
# Size warning — these datasets are browse-only from this app.
st.info("⚠️ These datasets are very large, totaling approximately 100GB. They are not available for direct download through this interface. Please visit the individual dataset links for more information on accessing the data.")

# Usage instructions with a copy-pasteable snippet.
st.subheader("How to Use These Datasets")
_SNIPPET = '''
from datasets import load_dataset

dataset = load_dataset("username/dataset_name")
'''
st.code(_SNIPPET, language="python")

st.markdown("""
1. Click on the dataset link to visit its Hugging Face page.
2. On the dataset page, you'll find information about the dataset's content, structure, and usage.
3. Use the code above to load a dataset, replacing `"username/dataset_name"` with the actual dataset identifier.
4. For these large datasets, consider using streaming or loading specific subsets to manage memory usage.
5. Always check the dataset's license and usage restrictions before incorporating it into your project.
""")

# Footer
st.markdown("---")
st.markdown("Created by Macrocosmos with ❤️ ")
macrocosmos-black.png ADDED
requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ pandas
2
+ streamlit
3
+ streamlit-card