yhavinga committed on
Commit
5d1b3d0
1 Parent(s): 1586c50

Update clean/clean.py

Browse files
Files changed (1) hide show
  1. clean/clean.py +19 -1
clean/clean.py CHANGED
@@ -1,3 +1,20 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import functools
2
  import gzip
3
  import hashlib
@@ -127,7 +144,8 @@ def clean_text(text,
127
  counter_inc_fn("filtered:too_few_sentences")
128
  return
129
  counter_inc_fn("passed")
130
- result = "\n".join(valid_lines).strip()
 
131
  return result
132
 
133
 
1
+ # Code adapted from https://github.com/tensorflow/datasets/blob/master/tensorflow_datasets/text/c4_utils.py
2
+ # that has the following license
3
+ # coding=utf-8
4
+ # Copyright 2021 The TensorFlow Datasets Authors.
5
+ #
6
+ # Licensed under the Apache License, Version 2.0 (the "License");
7
+ # you may not use this file except in compliance with the License.
8
+ # You may obtain a copy of the License at
9
+ #
10
+ # http://www.apache.org/licenses/LICENSE-2.0
11
+ #
12
+ # Unless required by applicable law or agreed to in writing, software
13
+ # distributed under the License is distributed on an "AS IS" BASIS,
14
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15
+ # See the License for the specific language governing permissions and
16
+ # limitations under the License.
17
+
18
  import functools
19
  import gzip
20
  import hashlib
144
  counter_inc_fn("filtered:too_few_sentences")
145
  return
146
  counter_inc_fn("passed")
147
+ result = "\
148
+ ".join(valid_lines).strip()
149
  return result
150
 
151