omkarenator committed
Commit 4a437aa
1 Parent(s): 9f2a4f7

fix bf example

common.py CHANGED
@@ -1,6 +1,13 @@
 from fasthtml.common import *
 from fasthtml.components import *
-from fasthtml.components import
+from fasthtml.components import (
+    D_title,
+    D_article,
+    D_front_matter,
+    D_contents,
+    D_byline,
+    D_cite,
+)
 from fh_plotly import plotly2fasthtml
 import pandas as pd
 import json
@@ -46,48 +53,56 @@ def dup_cluster_graph():
     return fig


+def dedup_pairs_bands():
+    return pd.DataFrame(
+        {
+            "Bloom Filter": [
+                "BF 0",
+                "",
+                "",
+                "",
+                "BF 1",
+                "",
+                "BF 8",
+            ],
+            "Band 0": [
+                "(A,B)",
+                "(C,D)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 1": [
+                "(A,B)",
+                "(C,D)",
+                "(F,K)",
+                "(B,K)",
+                "...",
+                "...",
+                "...",
+            ],
+            "....": [
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+                "...",
+            ],
+            "Band 8": [
+                "(A,B)",
+                "(C,D)",
+                "(D,E)",
+                "(E,K)",
+                "(B,K)",
+                "...",
+                "...",
+            ],
+        }
+    ).to_html(index=False, border=0)


 def dup_docs_count_graph():
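As an aside (not part of the commit), a quick way to sanity-check the new helper outside the page is sketched below: dedup_pairs_bands() returns a plain HTML string for the pairs-per-band table, which the page later embeds with NotStr.

# Hypothetical sanity check, not in the commit: the helper returns raw <table> markup.
html = dedup_pairs_bands()
assert html.lstrip().startswith("<table")
print(html[:120])  # preview the generated table HTML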
@@ -298,7 +313,9 @@ global_div = Div(
     "To illustrate the need for deduplication, below is the distribution of near-duplicate clusters, organized into buckets of 100. The first bucket contains clusters with sizes ranging from 2 to 100, as found in the Common Crawl dataset. Some clusters even reach up to a million documents."
 ),
 plotly2fasthtml(dup_cluster_graph()),
-P(
+P(
+    "The example below is from one such cluster. Here most of the text is repeated with just specifics changed."
+),
 Img(src="images/100k.png", style="max-width: 100%;"),
 P(
     "We started deduplication with 61.8 TB of filtered and compressed documents. The initial dataset had roughly 48.83 billion documents. First, we performed exact deduplication using a Bloom filter with a capacity of 1 billion and a false positive rate of 0.001. This reduced the documents from 48.83 billion to 40.21 billion, removing about 17% as exact duplicates. This step used constant memory for the Bloom filter and lessened the workload for subsequent near-deduplication."
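As a rough, self-contained illustration of that exact-deduplication step (a sketch, not the pipeline's actual code), the snippet below hashes each document and keeps it only when the hash has not been seen before. The BloomFilter class, the SHA-256/MD5 hashing, and the tiny demo capacity are assumptions made for the example; the production filter was sized for a billion entries.

import hashlib
import math


class BloomFilter:
    """Minimal Bloom filter sketch; illustrative only, not the pipeline's implementation."""

    def __init__(self, capacity: int, error_rate: float):
        # Standard sizing: m bits and k hash functions for the target false positive rate.
        self.num_bits = max(8, int(-capacity * math.log(error_rate) / (math.log(2) ** 2)))
        self.num_hashes = max(1, round(self.num_bits / capacity * math.log(2)))
        self.bits = bytearray((self.num_bits + 7) // 8)

    def _positions(self, item: bytes):
        # Double hashing: derive k bit positions from two independent digests.
        h1 = int.from_bytes(hashlib.sha256(item).digest()[:8], "big")
        h2 = int.from_bytes(hashlib.md5(item).digest()[:8], "big")
        for i in range(self.num_hashes):
            yield (h1 + i * h2) % self.num_bits

    def add(self, item: bytes) -> bool:
        """Set the item's bits; return True if it was already (probably) present."""
        seen = True
        for pos in self._positions(item):
            byte, bit = divmod(pos, 8)
            if not (self.bits[byte] >> bit) & 1:
                seen = False
                self.bits[byte] |= 1 << bit
        return seen


def exact_dedup(documents, capacity=1_000_000, error_rate=0.001):
    """Yield only documents whose exact text has not been seen before."""
    bf = BloomFilter(capacity, error_rate)
    for doc in documents:
        digest = hashlib.sha256(doc.encode("utf-8")).digest()
        if not bf.add(digest):
            yield doc


print(list(exact_dedup(["same text", "same text", "other text"])))
# -> ['same text', 'other text']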
@@ -344,7 +361,7 @@ global_div = Div(
 P(
     "There is a high chance that duplicates from different bands will have the same pairs in the same horizontal partition. Performing the Bloom filter step reduces the number of pairs by nearly ninefold."
 ),
-
+Div(NotStr(dedup_pairs_bands()), style="margin: 40px;"),
 P(
     "The resulting unique pairs are then used to identify clusters of near-duplicates by finding connected components in a graph, where the vertices represent documents and the edges represent matches."
 ),
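To make those last two steps concrete, here is a small, self-contained sketch, illustrative only and not the pipeline's code: candidate pairs reported by several bands are first collapsed into unique pairs (the job the Bloom filter does at scale), and a union-find pass then groups documents into connected components, i.e. near-duplicate clusters. The band contents mirror the example table above; the function names and data are assumptions for this illustration.

# Illustrative sketch only (not the pipeline's code): collapse candidate pairs
# reported by several bands into unique pairs, then cluster documents by
# finding connected components with a simple union-find.

def unique_pairs(bands):
    """Return the set of unordered pairs; at scale a Bloom filter plays this role."""
    seen = set()
    for band in bands:
        for a, b in band:
            seen.add((a, b) if a <= b else (b, a))  # normalize so (A,B) == (B,A)
    return seen


def connected_components(pairs):
    """Union-find over match pairs; each component is a near-duplicate cluster."""
    parent = {}

    def find(x):
        parent.setdefault(x, x)
        while parent[x] != x:
            parent[x] = parent[parent[x]]  # path halving
            x = parent[x]
        return x

    for a, b in pairs:
        ra, rb = find(a), find(b)
        if ra != rb:
            parent[rb] = ra

    clusters = {}
    for node in list(parent):
        clusters.setdefault(find(node), set()).add(node)
    return list(clusters.values())


# Candidate pairs per band, mirroring the example table above.
band_0 = [("A", "B"), ("C", "D"), ("E", "K"), ("B", "K")]
band_1 = [("A", "B"), ("C", "D"), ("F", "K"), ("B", "K")]
band_8 = [("A", "B"), ("C", "D"), ("D", "E"), ("E", "K"), ("B", "K")]

pairs = unique_pairs([band_0, band_1, band_8])
print(connected_components(pairs))
# here every example document lands in a single near-duplicate cluster:
# [{'A', 'B', 'C', 'D', 'E', 'F', 'K'}]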