# MiniGPT / filter.py
# CreatedNull's picture
# Upload folder using huggingface_hub
# 79eec1d verified
import json
import re
from dataset import SimpleTokenizr
from tqdm import tqdm
# Module-level tokenizer instance; filterdata() calls its tokenize() method
# to measure how many tokens each record's text produces.
tokenizer = SimpleTokenizr()
def filterdata(data, *, min_tokens=64, output_path="./data/filtered_data.jsonl"):
    """Filter records by content and length, then write the keepers as JSONL.

    A record is dropped when its ``"text"`` field contains any digit, or when
    the module-level ``tokenizer`` produces fewer than ``min_tokens`` tokens
    for it. The remaining records are written to ``output_path``, one JSON
    object per line, and a three-line summary of the counts is printed.

    Args:
        data: Iterable of JSON-serializable dict records. The ``"text"`` key
            is read with ``""`` as the fallback when it is missing.
        min_tokens: Minimum number of tokens a record needs to be kept.
        output_path: Destination JSONL file. It is overwritten if it exists
            and its parent directory is created when missing.

    Returns:
        None. The result is the file written to ``output_path``.
    """
    import os  # local import: only needed to prepare the output directory

    # Compile once instead of going through re.search's pattern lookup per record.
    has_digit = re.compile(r"\d").search

    kept = []
    digit_count = 0
    short_count = 0

    # The bar counts records, so it must not be labelled or scaled as bytes.
    for record in tqdm(data, unit="lines", unit_scale=True):
        text = record.get("text", "")

        if has_digit(text):
            digit_count += 1
            continue

        # Tokenize only records that survived the digit check.
        # NOTE(review): assumes tokenize() has no side effects that dropped
        # records were relied upon to trigger -- confirm against SimpleTokenizr.
        if len(tokenizer.tokenize(text)) < min_tokens:
            short_count += 1
            continue

        kept.append(record)

    print(f"Filtered {len(kept)} successfully!")
    print(f"Removed {digit_count} from data.")
    print(f"Removed {short_count} from data (too short).")

    # dirname is "" for a bare filename, and makedirs("") would raise.
    out_dir = os.path.dirname(output_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    with open(output_path, "w", encoding="utf-8") as f:
        # ensure_ascii=False keeps non-ASCII text readable in the output file.
        f.writelines(
            json.dumps(record, ensure_ascii=False) + "\n" for record in kept
        )