import json
import re

from dataset import SimpleTokenizr
from tqdm import tqdm

tokenizer = SimpleTokenizr()

def filterdata(data):
    """Filter parsed JSONL records, keeping only clean, sufficiently long texts."""
    filtered = []      # records that pass every check
    unused = []        # records rejected because their text contains digits
    low_quality = []   # placeholder for a low-quality filter that is not implemented yet
    too_short = []     # texts rejected for tokenizing to fewer than 64 tokens
    filtered_lines = 0
    unused_lines = 0
    low_quality_lines = 0
    too_short_lines = 0
    for record in tqdm(data, unit="line"):
        # Each record is an already-parsed JSON object with a "text" field.
        text = record.get("text", "")
        encoded = tokenizer.tokenize(text)
        if re.search(r"\d", text):
            # Reject any text that contains a digit.
            unused_lines += 1
            unused.append(record)
        elif len(encoded) >= 64:
            # Keep texts that are at least 64 tokens long.
            filtered_lines += 1
            filtered.append(record)
        else:
            # Everything else is too short.
            too_short_lines += 1
            too_short.append(text)
print(f"Filtered {filtered_lines} successfully!") | |
print(f"Removed {unused_lines} from data.") | |
print(f"Removed {long_lines} from data (too short).") | |
#print(f"Removed {low_quality} from data (low quality).") | |
    with open("./data/filtered_data.jsonl", "w", encoding="utf-8") as f:
        for record in filtered:
            # Write each kept record back out as one JSON object per line.
            f.write(json.dumps(record, ensure_ascii=False) + "\n")
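

# Example usage: a minimal sketch, assuming the raw corpus lives at
# ./data/raw_data.jsonl with one JSON object per line (both the path and the
# file layout are assumptions, not part of the original script).
if __name__ == "__main__":
    records = []
    with open("./data/raw_data.jsonl", "r", encoding="utf-8") as f:
        for raw in f:
            raw = raw.strip()
            if raw:
                records.append(json.loads(raw))
    filterdata(records)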