YuLan-Mini Resources
Collection
Pre-Training & post-training resources for YuLan-Mini
•
8 items
•
Updated
•
2
We use a math-classifier to retrieve math-related content from fineweb-edu, dclm, and other corpora, in order to upsample math-related content.
import json
import os
import time
from concurrent.futures import ProcessPoolExecutor, wait, ALL_COMPLETED
from time import sleep
import fasttext
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
from tqdm import tqdm
def print_error(value):
    """Print a worker-side error to stdout.

    flush=True matters here: this runs inside ProcessPoolExecutor workers,
    and every other diagnostic print in this script flushes, so an error
    message must not sit in a buffer when the worker exits.
    """
    print("error: ", value, flush=True)
def data_process(index, file, saved_dir):
    """Score every row of one parquet shard with a fastText math classifier
    and bucket positive rows into jsonl files by confidence score.

    Args:
        index: worker slot number; used to stagger start-up and to position
            the tqdm progress bar.
        file: path to the input ``.parquet`` shard; assumed to contain a
            ``text`` column (see usage of ``row['text']`` below).
        saved_dir: output directory; buckets are written to the ``06_07``,
            ``07_08``, ``08_09`` and ``09_10`` sub-directories, and raw
            predictions to ``labeled``.

    Returns:
        None. Read/predict errors are printed and swallowed so that one bad
        shard does not kill the whole pool.
    """
    try:
        model_path = "math_score.bin"
        model = fasttext.load_model(model_path)
        # saved_dir: fineweb-edu/data/CC...-math/
        filename = file.split('/')[-1].replace('.parquet', '.jsonl')
        # The 09_10 file is written last, so its existence marks a shard
        # that was already fully processed — skip it.
        path90 = os.path.join(saved_dir, "09_10", filename)
        if os.path.exists(path90):
            print("exist", path90, flush=True)
            return
        sleep(index * 3)  # stagger workers so they don't hit the disk at once
        os.makedirs(saved_dir, exist_ok=True)
        label_list = []
        s67_list = []
        s78_list = []
        s89_list = []
        s90_list = []
        st = time.time()
        print("reading parquet", file, flush=True)
        df = pd.read_parquet(file)
        ed = time.time()
        print("read parquet time: ", ed - st, flush=True)
        for _, row_original in tqdm(
                df.iterrows(),
                total=len(df),
                position=index,
                desc=filename,
        ):
            row = row_original.to_dict()
            # fastText predicts on a single line; strip newlines first.
            text = row['text'].replace('\n', ' ')
            pred = model.predict(text)
            label, score = pred[0][0], pred[1][0]
            label_list.append(pred)
            if label == '__label__positive':
                # Disjoint confidence buckets: [0.6,0.7), [0.7,0.8),
                # [0.8,0.9), [0.9,1.0].
                if 0.6 <= score < 0.7:
                    s67_list.append(row)
                elif 0.7 <= score < 0.8:
                    s78_list.append(row)
                elif 0.8 <= score < 0.9:
                    s89_list.append(row)
                elif 0.9 <= score <= 1.0:
                    s90_list.append(row)
    except Exception as e:
        print_error(e)
        return None
    os.makedirs(os.path.join(saved_dir, "labeled"), exist_ok=True)
    print("writing to file", flush=True)
    with open(
            os.path.join(saved_dir, "labeled",
                         filename.replace('.jsonl', '.txt')), 'w') as f:
        f.write("\n".join(str(pred) for pred in label_list))
    # BUG FIX: "06_07" was missing from this list, so the very first bucket
    # write below crashed with FileNotFoundError on every fresh run.
    for dir_name, bucket in [("06_07", s67_list), ("07_08", s78_list),
                             ("08_09", s89_list), ("09_10", s90_list)]:
        os.makedirs(os.path.join(saved_dir, dir_name), exist_ok=True)
        with open(os.path.join(saved_dir, dir_name, filename), 'w') as f:
            f.write("\n".join(json.dumps(line_now) for line_now in bucket))
    return None
if __name__ == '__main__':
    # Walk the fineweb-edu Common Crawl dumps, collect every parquet shard,
    # and fan the shards out to a small process pool for math classification.
    num_process = 5
    start_time = time.time()
    file_paths = []
    base = "fineweb-edu/data/"
    coun = 0
    for file_name in [
            'CC-MAIN-2017-04', 'CC-MAIN-2017-09', 'CC-MAIN-2017-13',
            'CC-MAIN-2017-17', 'CC-MAIN-2017-22', 'CC-MAIN-2017-26',
            'CC-MAIN-2017-30', 'CC-MAIN-2017-34', 'CC-MAIN-2017-39',
            'CC-MAIN-2017-43', 'CC-MAIN-2017-47', 'CC-MAIN-2017-51',
            "CC-MAIN-2018-05", "CC-MAIN-2018-09", "CC-MAIN-2018-13",
            "CC-MAIN-2018-17", "CC-MAIN-2018-22", "CC-MAIN-2018-26",
            "CC-MAIN-2018-30", "CC-MAIN-2018-34", "CC-MAIN-2018-39",
            "CC-MAIN-2018-43", "CC-MAIN-2018-47", "CC-MAIN-2018-51",
            "CC-MAIN-2019-04", "CC-MAIN-2019-09", "CC-MAIN-2019-13",
            "CC-MAIN-2019-18", "CC-MAIN-2019-22", "CC-MAIN-2019-26",
            "CC-MAIN-2019-30", "CC-MAIN-2019-35", "CC-MAIN-2019-39",
            "CC-MAIN-2019-43", "CC-MAIN-2019-47", "CC-MAIN-2019-51",
    ]:
        print("Walking:", file_name)
        original_file_path = base + file_name
        math_dir = original_file_path + "-math"
        print(math_dir)
        for root, dirs, files in os.walk(original_file_path):
            for file in files:
                if file.endswith(".parquet"):  # only process parquet shards
                    file_path = os.path.abspath(os.path.join(root, file))
                    coun += 1
                    # One output dir per shard, named after the shard with
                    # the 8-char ".parquet" suffix stripped.
                    saved_dir = math_dir + "/" + file_path.split("/")[-1][:-8]
                    print(saved_dir)
                    file_paths.append((file_path, saved_dir))
    print(coun)
    # BUG FIX: removed `print(len(lines))` — `lines` was never defined and
    # raised a NameError before any work was submitted.
    print("total file paths", len(file_paths))
    if not file_paths:
        # Guard: ProcessPoolExecutor(0) raises ValueError.
        raise SystemExit("no parquet files found under " + base)
    num_process = min(num_process, len(file_paths))
    print("num_process", num_process)
    futures = []
    with ProcessPoolExecutor(num_process) as executor:
        for index, (file_path, saved_dir) in enumerate(file_paths):
            futures.append(
                executor.submit(data_process, index % num_process, file_path,
                                saved_dir))
        done, not_done = wait(futures, return_when=ALL_COMPLETED)
    end_time = time.time()
    # Report total elapsed wall-clock time.
    elapsed_time = end_time - start_time
    print(f"Time taken: {elapsed_time} seconds")
    print("=" * 100)
We welcome any form of contribution, including feedback on bad cases produced by the model, feature suggestions, and example contributions. You can contribute by submitting an issue.
YuLan-Mini is developed and maintained by AI Box, Renmin University of China.
If you find YuLan-Mini helpful for your research or development, please cite our technical report:
@article{hu2024yulan,
title={YuLan-Mini: An Open Data-efficient Language Model},
author={Hu, Yiwen and Song, Huatong and Deng, Jia and Wang, Jiapeng and Chen, Jie and Zhou, Kun and Zhu, Yutao and Jiang, Jinhao and Dong, Zican and Zhao, Wayne Xin and others},
journal={arXiv preprint arXiv:2412.17743},
year={2024}
}