hunterhector committed on
Commit
ddc7526
1 Parent(s): a1ddc25

fix sankey

Browse files
Files changed (2) hide show
  1. main.py +17 -17
  2. web.py +72 -19
main.py CHANGED
@@ -52,7 +52,7 @@ front_matter = {
52
  },
53
  {
54
  "author": "Nikhil Ranjan",
55
- "authorURL": "https://huggingface.co/NikhilRanjan",
56
  "affiliation": "MBZUAI",
57
  "affiliationURL": "",
58
  },
@@ -64,56 +64,56 @@ front_matter = {
64
  },
65
  {
66
  "author": "Zhen Wang",
67
- "authorURL": "https://huggingface.co/ZhenWang",
68
  "affiliation": "MBZUAI",
69
  "affiliationURL": "",
70
  },
71
  {
72
  "author": "An Li",
73
- "authorURL": "https://huggingface.co/AnLi",
74
- "affiliation": "",
75
  "affiliationURL": "",
76
  },
77
  {
78
  "author": "Zhoujun Cheng",
79
- "authorURL": "https://huggingface.co/ZhoujunCheng",
80
- "affiliation": "",
81
  "affiliationURL": "",
82
  },
83
  {
84
  "author": "Suqi Sun",
85
- "authorURL": "https://huggingface.co/SuqiSun",
86
  "affiliation": "Petuum, Inc.",
87
  "affiliationURL": "",
88
  },
89
  {
90
  "author": "Cun Mu",
91
- "authorURL": "https://huggingface.co/CunMu",
92
- "affiliation": "",
93
  "affiliationURL": "",
94
  },
95
  {
96
  "author": "Victor Miller",
97
- "authorURL": "https://huggingface.co/VictorMiller",
98
- "affiliation": "",
99
  "affiliationURL": "",
100
  },
101
  {
102
  "author": "Yue Peng",
103
- "authorURL": "https://huggingface.co/YuePeng",
104
- "affiliation": "",
105
  "affiliationURL": "",
106
  },
107
  {
108
  "author": "Eric P. Xing",
109
- "authorURL": "https://huggingface.co/EricXing",
110
- "affiliation": "MBZUAI & CMU",
111
  "affiliationURL": "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
112
  },
113
  {
114
  "author": "Zhengzhong Liu",
115
- "authorURL": "https://huggingface.co/ZhengzhongLiu",
116
- "affiliation": "",
117
  "affiliationURL": "",
118
  },
119
  ],
 
52
  },
53
  {
54
  "author": "Nikhil Ranjan",
55
+ "authorURL": "https://huggingface.co/nikhilranjan",
56
  "affiliation": "MBZUAI",
57
  "affiliationURL": "",
58
  },
 
64
  },
65
  {
66
  "author": "Zhen Wang",
67
+ "authorURL": "",
68
  "affiliation": "MBZUAI",
69
  "affiliationURL": "",
70
  },
71
  {
72
  "author": "An Li",
73
+ "authorURL": "https://huggingface.co/an1118",
74
+ "affiliation": "UCSD",
75
  "affiliationURL": "",
76
  },
77
  {
78
  "author": "Zhoujun Cheng",
79
+ "authorURL": "https://huggingface.co/zhoujun",
80
+ "affiliation": "UCSD",
81
  "affiliationURL": "",
82
  },
83
  {
84
  "author": "Suqi Sun",
85
+ "authorURL": "https://huggingface.co/mylibrar",
86
  "affiliation": "Petuum, Inc.",
87
  "affiliationURL": "",
88
  },
89
  {
90
  "author": "Cun Mu",
91
+ "authorURL": "https://huggingface.co/CarisMu",
92
+ "affiliation": "MBZUAI",
93
  "affiliationURL": "",
94
  },
95
  {
96
  "author": "Victor Miller",
97
+ "authorURL": "https://huggingface.co/vamiller12",
98
+ "affiliation": "Petuum, Inc.",
99
  "affiliationURL": "",
100
  },
101
  {
102
  "author": "Yue Peng",
103
+ "authorURL": "https://huggingface.co/Dreamever",
104
+ "affiliation": "MBZUAI",
105
  "affiliationURL": "",
106
  },
107
  {
108
  "author": "Eric P. Xing",
109
+ "authorURL": "",
110
+ "affiliation": "MBZUAI",
111
  "affiliationURL": "https://www.mbzuai.ac.ae/ & https://www.cs.cmu.edu/",
112
  },
113
  {
114
  "author": "Zhengzhong Liu",
115
+ "authorURL": "https://huggingface.co/hunterhector",
116
+ "affiliation": "Petuum, Inc. / MBZUAI ",
117
  "affiliationURL": "",
118
  },
119
  ],
web.py CHANGED
@@ -248,7 +248,7 @@ attrs.fraction_of_characters_in_duplicate_lines = sum(
248
  # Plot the distribution sankey.
249
 
250
  # The filtering percentages
251
- web_filtering_percentages = [
252
  100,
253
  96.98,
254
  43.84,
@@ -264,13 +264,13 @@ web_filtering_percentages = [
264
  web_filtering_steps = [
265
  "Common Crawl",
266
  "Text Extraction",
267
- "Language Identification",
268
  "URL Filtering",
269
  "Repetition Removal",
270
- "Document-wise Filtering",
271
- "Line-wise Corrections",
272
- "Local Exact Deduplication",
273
- "Global Fuzzy Deduplication",
274
  ]
275
 
276
  step_colors = [
@@ -285,6 +285,8 @@ step_colors = [
285
  '#1f773c', # Lightest green added at the end
286
  ]
287
 
 
 
288
  def add_opacity(hex_color, opacity):
289
  # Remove '#' if present
290
  hex_color = hex_color.lstrip('#')
@@ -293,29 +295,80 @@ def add_opacity(hex_color, opacity):
293
  # Add the opacity value
294
  return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
295
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
296
 
297
- # Concatenate the percentage to each label
298
- labels_with_percentages = [f"{label} ({percentage}%)" for label, percentage in zip(web_filtering_steps, web_filtering_percentages)]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
 
300
  filtering_sankey_fig = go.Figure(go.Sankey(
301
  node=dict(
302
- label=labels_with_percentages,
303
- color=[add_opacity(c, 0.8) for c in step_colors[:9]] ,
304
  pad=15, # Adjust padding between nodes
305
  thickness=30,
306
  ),
307
  link=dict(
308
- source=list(range(0,8)), # Each source is the previous step
309
- target=list(range(1,9)), # Each target is the next step
310
- value=web_filtering_percentages,
311
- color=[add_opacity(c, 0.5) for c in step_colors[:8]] # Match the link colors to the source node
312
-
313
  )
314
  ))
315
 
316
  filtering_sankey_fig.update_layout(
317
- title_text="Web Data Filtering Process",
318
- font_size=10,
 
 
 
 
 
319
  margin=dict(l=0, r=0, t=40, b=0)
320
  )
321
 
@@ -345,10 +398,10 @@ def web_data():
345
  P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering processing that should be consider for any filtering project. However, we are leaving this to future work."),
346
  table_div_qf_filter_data,
347
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
348
- Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
349
  # The sankey diagram of the filtering percentage
350
  plotly2fasthtml(filtering_sankey_fig),
351
- P("Note: All percentages are based on the number of documents. The gray bars represent the relative percentages of removed documents at each step, while the colorful bars represent the percentages of retained documents relative to the total number of documents in the raw Common Crawl."),
352
  id="section2",),
353
  Section(
354
  H2("Document Preparation"),
 
248
  # Plot the distribution sankey.
249
 
250
  # The filtering percentages
251
+ web_remaining_percent = [
252
  100,
253
  96.98,
254
  43.84,
 
264
  web_filtering_steps = [
265
  "Common Crawl",
266
  "Text Extraction",
267
+ "Language ID",
268
  "URL Filtering",
269
  "Repetition Removal",
270
+ "Document Filtering",
271
+ "Line Corrections",
272
+ "Local Exact Dedup",
273
+ "Global Fuzzy Dedup",
274
  ]
275
 
276
  step_colors = [
 
285
  '#1f773c', # Lightest green added at the end
286
  ]
287
 
288
+ grey_color = "#d3d3d3"
289
+
290
  def add_opacity(hex_color, opacity):
291
  # Remove '#' if present
292
  hex_color = hex_color.lstrip('#')
 
295
  # Add the opacity value
296
  return f"rgba({rgb[0]}, {rgb[1]}, {rgb[2]}, {opacity})"
297
 
298
+ # Create a list for all the node labels, colors, and values
299
+ node_labels = []
300
+ node_colors = []
301
+
302
+ # Create source and target for links
303
+ source = []
304
+ target = []
305
+ link_colors = []
306
+ link_values = []
307
+
308
+ # For each step, we have two nodes: remaining and filtered
309
+ for i, label in enumerate(web_filtering_steps):
310
+ node_labels.append(f"{label} ({web_remaining_percent[i]}%)")
311
+ node_colors.append(add_opacity(step_colors[i], 0.85))
312
+
313
+ if i > 0:
314
+ # Nothing filtered at step 0, set the nodes of the remaining percentages.
315
+ node_labels.append(f"{100 - web_remaining_percent[i]:.2f}%")
316
+ node_colors.append(grey_color)
317
+
318
+ # From the previous remaining part to the current remaining part.
319
+ if i == 1:
320
+ # Nothing got filtered before step 1.
321
+ prev_remain_idx = 0
322
+ curr_remain_idx = 1
323
+ curr_filtered_idx = 2
324
+ else:
325
+ prev_remain_idx = 2 * i - 3
326
+ prev_filtered_idx = 2 * i - 2
327
+ curr_remain_idx = 2 * i - 1
328
+ curr_filtered_idx = 2 * i
329
 
330
+ # Previous remaining -> current remaining
331
+ source.append(prev_remain_idx)
332
+ target.append(curr_remain_idx)
333
+ link_colors.append(add_opacity(step_colors[i-1], 0.7))
334
+ link_values.append(web_remaining_percent[i])
335
+
336
+ # Previous remaining -> current filtered
337
+ source.append(prev_remain_idx)
338
+ target.append(curr_filtered_idx)
339
+ link_colors.append(add_opacity(step_colors[i-1], 0.5))
340
+ link_values.append(web_remaining_percent[i-1] - web_remaining_percent[i])
341
+
342
+ if i > 1:
343
+ # We have data filtered out at step 1, previous filtered -> current filtered
344
+ source.append(prev_filtered_idx)
345
+ target.append(curr_filtered_idx)
346
+ link_colors.append(grey_color)
347
+ link_values.append(100 - web_remaining_percent[i - 1])
348
 
349
  filtering_sankey_fig = go.Figure(go.Sankey(
350
  node=dict(
351
+ label=node_labels,
352
+ color=node_colors,
353
  pad=15, # Adjust padding between nodes
354
  thickness=30,
355
  ),
356
  link=dict(
357
+ source=source, # Source from remaining
358
+ target=target, # Target to filtered
359
+ value=link_values, # Interleaved remaining and filtered values
360
+ color=link_colors
 
361
  )
362
  ))
363
 
364
  filtering_sankey_fig.update_layout(
365
+ title_text="Web Data Filtering Percentage",
366
+ title_x=0.5, # Centers the title
367
+ title_font=dict(
368
+ family="Arial, sans-serif", # Font family
369
+ size=18, # Font size
370
+ ),
371
+ font_size=8,
372
  margin=dict(l=0, r=0, t=40, b=0)
373
  )
374
 
 
398
  P("The table below provides a comparison of the quality filters that have been applied to each dataset. Of note, TxT360 does not use any machine learning (ML) based filters. ML filters are a useful and efficient filtering processing that should be consider for any filtering project. However, we are leaving this to future work."),
399
  table_div_qf_filter_data,
400
  P("Our filtering rate is illustrated below. Before deduplication, our filtering rate is comparable to RefinedWeb. During global deduplication, we removed approximately 85.89% of the data, significantly higher than previous works, indicating a large number of duplicates across snapshots. "),
401
+ # Img(src="images/filter_rate.jpg", height = "300", width = "600" ),
402
  # The sankey diagram of the filtering percentage
403
  plotly2fasthtml(filtering_sankey_fig),
404
+ P("A significant portion of the documents is filtered after the whole process. This figure illustrates the percentage of documents filtered at each step. The grey bars represent the filtered documents. The statistics are largely consistent with prior work (e.g., RefinedWeb) across most steps, though we have incorporated some custom filtering steps."),
405
  id="section2",),
406
  Section(
407
  H2("Document Preparation"),