import json
import re
from dataset import SimpleTokenizr
from tqdm import tqdm
tokenizer = SimpleTokenizr()  # project-local tokenizer defined in dataset.py
def filterdata(data):
    """Keep records whose text has no digits and tokenizes to at least 64 tokens,
    then write the kept records to ./data/filtered_data.jsonl."""
    filtered = []     # records that pass every check
    unused = []       # records rejected because their text contains digits
    too_short = []    # texts rejected because they tokenize to fewer than 64 tokens
    low_quality = []  # placeholder for a future low-quality filter (currently unused)
    for record in tqdm(data, unit="line"):
        # Each record is expected to be a dict with a "text" field.
        text = record.get("text", "")
        encoded = tokenizer.tokenize(text)
        if re.search(r"\d", text):
            # Drop any sample whose text contains digits.
            unused.append(record)
        elif len(encoded) >= 64:
            filtered.append(record)
        else:
            too_short.append(text)
    print(f"Filtered {len(filtered)} successfully!")
    print(f"Removed {len(unused)} from data (contained digits).")
    print(f"Removed {len(too_short)} from data (too short).")
    # print(f"Removed {len(low_quality)} from data (low quality).")
    with open("./data/filtered_data.jsonl", "w", encoding="utf-8") as f:
        for record in filtered:
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
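
# Usage sketch, not part of the original script: assumes the raw corpus lives at
# ./data/raw_data.jsonl (hypothetical path) with one JSON object per line, each
# carrying a "text" field as filterdata() expects. Adjust to your project layout.
if __name__ == "__main__":
    with open("./data/raw_data.jsonl", "r", encoding="utf-8") as f:
        records = [json.loads(line) for line in f if line.strip()]
    filterdata(records)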