vtrubamacrocosmos committed on
Commit
a4d754c
1 Parent(s): 64a090d

fix billion display

Browse files
Files changed (1) hide show
  1. app.py +22 -15
app.py CHANGED
@@ -59,9 +59,9 @@ st.markdown("Explore massive datasets hosted on Hugging Face, totaling approxima
59
  # Function to load dataset information
60
  @st.cache_data
61
  def load_datasets():
62
- return [
63
  # Reddit datasets
64
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_69", "Number of rows": "6 MLN"},
65
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_229", "Number of rows": "44,815,182"},
66
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_88", "Number of rows": "253,506,882"},
67
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_218", "Number of rows": "562,042"},
@@ -71,13 +71,13 @@ def load_datasets():
71
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_71", "Number of rows": "259,924,884"},
72
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_209", "Number of rows": "209,698,975"},
73
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_218", "Number of rows": "7,064,613"},
74
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "249 MLN"},
75
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "303 MLN"},
76
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "1.12 MLN"},
77
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "132 MLN"},
78
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "130 MLN"},
79
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "31.2 MLN"},
80
- {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "26.9 MLN"},
81
 
82
  # X datasets
83
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/littleGuagua/x_dataset_0", "Number of rows": "331,611,777"},
@@ -88,20 +88,27 @@ def load_datasets():
88
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/x_dataset_218", "Number of rows": "1,753,878"},
89
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/SAVE0x0/x_dataset_191", "Number of rows": "92,588"},
90
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/johnny8188/x_dataset_187", "Number of rows": "52,762"},
91
- {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "332 MLN"},
92
- {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "9.9 K"},
93
- {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "89 K"}
94
  ]
 
 
 
 
 
95
 
96
  # Load datasets
97
  datasets = load_datasets()
98
  df = pd.DataFrame(datasets)
99
 
 
 
 
100
  # Display statistics
101
- col1, col2, col3 = st.columns(3)
102
  with col1:
103
- total_rows = sum(float(str(rows).split()[0].replace(',', '')) for rows in df['Number of rows'])
104
- st.metric("Total Rows", f"{total_rows / 1000:.2f}B")
105
  with col2:
106
  st.metric("Total Datasets", len(df))
107
 
 
59
  # Function to load dataset information
60
  @st.cache_data
61
  def load_datasets():
62
+ datasets = [
63
  # Reddit datasets
64
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_69", "Number of rows": "6000000"},
65
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_229", "Number of rows": "44,815,182"},
66
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_88", "Number of rows": "253,506,882"},
67
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_218", "Number of rows": "562,042"},
 
71
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_71", "Number of rows": "259,924,884"},
72
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_209", "Number of rows": "209,698,975"},
73
  {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_218", "Number of rows": "7,064,613"},
74
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/dataverse-scraping/reddit_dataset_192", "Number of rows": "249000000"},
75
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/icedwind/reddit_dataset_226", "Number of rows": "303000000"},
76
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "1120000"},
77
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/chris241/reddit_dataset_75", "Number of rows": "132000000"},
78
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/wenknow/reddit_dataset_242", "Number of rows": "130000000"},
79
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/mgrtsv/reddit_dataset_231", "Number of rows": "31200000"},
80
+ {"Source": "Reddit", "DataSet repo link": "https://huggingface.co/datasets/PlanAPlanB/reddit_dataset_9", "Number of rows": "26900000"},
81
 
82
  # X datasets
83
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/littleGuagua/x_dataset_0", "Number of rows": "331,611,777"},
 
88
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/x_dataset_218", "Number of rows": "1,753,878"},
89
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/SAVE0x0/x_dataset_191", "Number of rows": "92,588"},
90
  {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/johnny8188/x_dataset_187", "Number of rows": "52,762"},
91
+ {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/icedwind/x_dataset_19", "Number of rows": "332000000"},
92
+ {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/wenknow/x_dataset", "Number of rows": "9900"},
93
+ {"Source": "X", "DataSet repo link": "https://huggingface.co/datasets/arrmlet/reddit_dataset_123", "Number of rows": "89000"}
94
  ]
95
+ return datasets
96
+
97
+ # Function to convert row count to float
98
+ def parse_row_count(row_count):
99
+ return float(row_count.replace(',', ''))
100
 
101
  # Load datasets
102
  datasets = load_datasets()
103
  df = pd.DataFrame(datasets)
104
 
105
+ # Calculate total rows
106
+ total_rows = sum(parse_row_count(rows) for rows in df['Number of rows'])
107
+
108
  # Display statistics
109
+ col1, col2 = st.columns(2)
110
  with col1:
111
+ st.metric("Total Rows", f"{total_rows / 1e9:.2f}B")
 
112
  with col2:
113
  st.metric("Total Datasets", len(df))
114