Spaces:
Running
Running
victormiller
committed on
Commit
•
c642284
1
Parent(s):
eee4211
Update curated.py
Browse files- curated.py +1 -1
curated.py
CHANGED
@@ -464,7 +464,7 @@ filtering_process = Div(
|
|
464 |
Ol(
|
465 |
Li("Language Filter: any language other than English are discarded"),
|
466 |
Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
|
467 |
-
Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by"
|
468 |
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
469 |
),
|
470 |
H4("Local Deduplication Process"),
|
|
|
464 |
Ol(
|
465 |
Li("Language Filter: any language other than English are discarded"),
|
466 |
Li("Minimum Word Count Filter: less than 500 words (not inclusive) are discarded"),
|
467 |
+
Li("Unigram Log Probablity Filter: Documents were kept if they their average unigram log probability was higher than -20. To calculate the average log word probability, we use word frequencies extracted from the", A("1T Web-gram corpus", href= "https://catalog.ldc.upenn.edu/LDC2006T13"),". Specifically, we use the list available created by", A("Rachel Tatman", href="https://www.kaggle.com/datasets/rtatman/english-word-frequency"),"."),
|
468 |
Li("Note: The Frequency Filter was calculated but not applied. The most frequent word in the paper consists of alpha characters only, and it appears in less than 7.5% of the document. Words are obtained by splitting the text on whitespace."),
|
469 |
),
|
470 |
H4("Local Deduplication Process"),
|