victormiller committed on
Commit
230ca5c
1 Parent(s): e759b31

Update overview

Browse files
Files changed (1) hide show
  1. overview +80 -80
overview CHANGED
@@ -44,104 +44,104 @@ dataset_comparison = pd.DataFrame(
44
  "4 Sources",
45
  ],
46
  "Wikipedia": [
47
- "Improves data quality by removing irrelevant documents",
48
- "Filters out low-quality or incomplete documents",
49
- "Provides additional information for analysis",
50
- "Enables language-specific analysis and insights",
51
- "Helps understand the complexity and content of documents",
52
- "Identifies important terms and topics in the dataset",
53
- "Quantifies the importance of individual words",
54
- "RedPajama-v1",
55
  ],
56
  "FreeLaw": [
57
- "May exclude documents in less common languages",
58
- "May remove documents with valuable information",
59
- "May introduce bias in the analysis",
60
- "May not accurately represent the language distribution",
61
- "May not capture the complexity of document structure",
62
- "May be sensitive to noise and outliers",
63
- "May not capture the semantic meaning of words",
64
- "RedPajama-v1",
65
  ],
66
  "DM Math": [
67
- "May exclude documents in less common languages",
68
- "May remove documents with valuable information",
69
- "May introduce bias in the analysis",
70
- "May not accurately represent the language distribution",
71
- "May not capture the complexity of document structure",
72
- "May be sensitive to noise and outliers",
73
- "May not capture the semantic meaning of words",
74
- "RedPajama-v1",
75
  ],
76
  "USPTO": [
77
- "May exclude documents in less common languages",
78
- "May remove documents with valuable information",
79
- "May introduce bias in the analysis",
80
- "May not accurately represent the language distribution",
81
- "May not capture the complexity of document structure",
82
- "May be sensitive to noise and outliers",
83
- "May not capture the semantic meaning of words",
84
- "RedPajama-v1",
85
  ],
86
  "PG-19": [
87
- "May exclude documents in less common languages",
88
- "May remove documents with valuable information",
89
- "May introduce bias in the analysis",
90
- "May not accurately represent the language distribution",
91
- "May not capture the complexity of document structure",
92
- "May be sensitive to noise and outliers",
93
- "May not capture the semantic meaning of words",
94
- "RedPajama-v1",
95
  ],
96
  "HackerNews": [
97
- "May exclude documents in less common languages",
98
- "May remove documents with valuable information",
99
- "May introduce bias in the analysis",
100
- "May not accurately represent the language distribution",
101
- "May not capture the complexity of document structure",
102
- "May be sensitive to noise and outliers",
103
- "May not capture the semantic meaning of words",
104
- "RedPajama-v1",
105
  ],
106
  "Ubuntu IRC": [
107
- "May exclude documents in less common languages",
108
- "May remove documents with valuable information",
109
- "May introduce bias in the analysis",
110
- "May not accurately represent the language distribution",
111
- "May not capture the complexity of document structure",
112
- "May be sensitive to noise and outliers",
113
- "May not capture the semantic meaning of words",
114
- "RedPajama-v1",
115
  ],
116
  "EuroParl": [
117
- "May exclude documents in less common languages",
118
- "May remove documents with valuable information",
119
- "May introduce bias in the analysis",
120
- "May not accurately represent the language distribution",
121
- "May not capture the complexity of document structure",
122
- "May be sensitive to noise and outliers",
123
- "May not capture the semantic meaning of words",
124
- "RedPajama-v1",
125
  ],
126
  "StackExchange": [
127
- "May exclude documents in less common languages",
128
- "May remove documents with valuable information",
129
- "May introduce bias in the analysis",
130
- "May not accurately represent the language distribution",
131
- "May not capture the complexity of document structure",
132
- "May be sensitive to noise and outliers",
133
- "May not capture the semantic meaning of words",
134
- "RedPajama-v1",
135
  ],
136
  "Code": [
137
- "May exclude documents in less common languages",
138
- "May remove documents with valuable information",
139
- "May introduce bias in the analysis",
140
- "May not accurately represent the language distribution",
141
- "May not capture the complexity of document structure",
142
- "May be sensitive to noise and outliers",
143
- "May not capture the semantic meaning of words",
144
- "RedPajama-v1",
145
  ],
146
  }
147
  )
 
44
  "4 Sources",
45
  ],
46
  "Wikipedia": [
47
+ "310+ Languages",
48
+ "-",
49
+ "-",
50
+ "-",
51
+ "-",
52
+ "what does a check mark mean?",
53
+ "what does a check mark mean?",
54
+ "English Only",
55
  ],
56
  "FreeLaw": [
57
+ "Included",
58
+ "-",
59
+ "-",
60
+ "-",
61
+ "-",
62
+ "-",
63
+ "-",
64
+ "Included",
65
  ],
66
  "DM Math": [
67
+ "Included",
68
+ "-",
69
+ "-",
70
+ "-",
71
+ "-",
72
+ "-",
73
+ "-",
74
+ "Included",
75
  ],
76
  "USPTO": [
77
+ "Included",
78
+ "-",
79
+ "-",
80
+ "-",
81
+ "-",
82
+ "-",
83
+ "-",
84
+ "Included",
85
  ],
86
  "PG-19": [
87
+ "Included",
88
+ "-",
89
+ "-",
90
+ "-",
91
+ "-",
92
+ "Included",
93
+ "Included",
94
+ "Included",
95
  ],
96
  "HackerNews": [
97
+ "Included",
98
+ "-",
99
+ "-",
100
+ "-",
101
+ "-",
102
+ "-",
103
+ "-",
104
+ "Included",
105
  ],
106
  "Ubuntu IRC": [
107
+ "Included",
108
+ "-",
109
+ "-",
110
+ "-",
111
+ "-",
112
+ "-",
113
+ "-",
114
+ "Included",
115
  ],
116
  "EuroParl": [
117
+ "Included",
118
+ "-",
119
+ "-",
120
+ "-",
121
+ "-",
122
+ "-",
123
+ "-",
124
+ "Included",
125
  ],
126
  "StackExchange": [
127
+ "Included",
128
+ "-",
129
+ "-",
130
+ "-",
131
+ "-",
132
+ "-",
133
+ "Included",
134
+ "Included",
135
  ],
136
  "Code": [
137
+ "- what is this?",
138
+ "-",
139
+ "-",
140
+ "-",
141
+ "-",
142
+ "Included",
143
+ "Included",
144
+ "Included",
145
  ],
146
  }
147
  )