# Cortex / process_data.py
import os
import re
import json
from sklearn.utils import shuffle
from llm_trainer import TrainerTools
from constant import *
import pickle
import itertools
import pandas as pd
def _init():
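    """Initialize the runtime environment and enable tokenizer parallelism."""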
from utils import init_env
init_env()
os.environ["TOKENIZERS_PARALLELISM"] = "true"
def _remove_urls(text: str):
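    """Strip http(s)/ftp URLs and bare domain-like strings from text."""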
url_pattern = re.compile(r'(?:(?:https?|ftp):\/\/)?[\w\/\-?=%.]+\.[\w\/\-&?=%.]+')
return url_pattern.sub('', text)
def _remove_brackets(text: str):
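    """Remove empty bracket pairs (ASCII and full-width) left behind after URL removal."""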
return (text.replace('[]', '')
.replace('{}', '')
.replace('()', '')
.replace('<>', '')
.replace('【】', '')
.replace('《》', '')
.replace('()', '')
.replace('(,)', '')
# .replace('\"\"', '')
# .replace("\'\'", '')
)
def _filter_content(content: str) -> str:
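    """Clean raw text and substitute the {{assistant_name}} placeholder."""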
content = _remove_brackets(_remove_urls(content))
content = content.replace("{{assistant_name}}", assistant_name)
return content
def _extract_think_and_answer(text: str):
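    """Split a response into its <think> content and the answer, unwrapping an optional <answer> tag."""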
match = re.search(r"<think>(.*?)</think>(.*)", text, re.DOTALL)
# 提取 <think> 和 </think> 中间的内容 (第一个捕获组)
think_data = match.group(1)
# 提取 </think> 后面的内容 (第二个捕获组)
content = match.group(2)
if '<answer>' in content and '</answer>' in content:
match = re.search(r"<answer>(.*?)</answer>(.*)", content, re.DOTALL)
# 提取 <think> 和 </think> 中间的内容 (第一个捕获组)
content = match.group(1)
return think_data, content
def preprocess_wikipedia():
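    """Tokenize the filtered Chinese Wikipedia dump and pickle the encoded samples."""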
print('preprocess_wikipedia')
encoded = []
with open('./data/raw/wikipedia-cn-20230720-filtered.json', 'r') as f:
json_ = json.loads(f.read())
for item in json_:
item = TrainerTools().tokenizer.encode(f"{item['completion']}{TrainerTools().tokenizer.text_end}")
encoded.append(item)
with open(f'./data/tmp/wikipedia.pkl', 'wb') as f:
pickle.dump(encoded, f)
def preprocess_cmm_math():
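    """Tokenize text-only CMM-Math samples (question, options, analysis, answer) and pickle them."""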
print('preprocess_cmm_math')
def is_empty(text):
return len(text) == 0 or text == 'null'
result = []
with open('./data/raw/CMM-Math.jsonl', 'r') as f:
for line in f:
json_ = json.loads(line)
if len(json_['image']) == 0:
question = json_['question']
options = json_['options']
analysis = json_['analysis']
answer = json_['answer']
content = f'{question}\n'
if not is_empty(options):
content += f'{options}\n'
if not is_empty(analysis):
content += f'{analysis}\n'
if not is_empty(answer):
content += f'答案:{answer}'
content = f'{content}{TrainerTools().tokenizer.text_end}'
result.append(TrainerTools().tokenizer.encode(content))
with open('./data/tmp/cmm_math.pkl', 'wb') as f:
pickle.dump(result, f)
def sample_github_code():
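    """Download one shard of swift/github-code, tokenize a quarter of it, and pickle the result."""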
print('sample_github_code')
from modelscope import dataset_snapshot_download
encoded = []
    # Only take about 1/4 of a single file
include_files = ['train-00019-of-01126.parquet']
for include_file in include_files:
dataset_snapshot_download(
'swift/github-code',
allow_file_pattern=[f'data/{include_file}'],
local_dir=f'./data/tmp/'
)
local_file_name = f'./data/tmp/data/{include_file}'
df = pd.read_parquet(local_file_name, engine="pyarrow")
values = df['content'].values[:len(df['content'].values)//4]
for v in values:
v = f'{v}{TrainerTools().tokenizer.text_end}'
encoded.append(TrainerTools().tokenizer.encode(v.strip()))
with open(f'./data/tmp/github_code.pkl', 'wb') as f:
pickle.dump(encoded, f)
def preprocess_pretrain_data():
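    """Tokenize zh/en SFT data into short/long buckets, flushing ~4e8-token chunks to disk."""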
tag_list = ['zh', 'en']
short_thresholds = [1536, 3072]
for file_idx in range(len(tag_list)):
result_short = []
result_long = []
tokens_count_short = 0
tokens_count_long = 0
suffix_short = 0
suffix_long = 0
file = f'./data/raw/sft_data_{tag_list[file_idx]}.jsonl'
print(f'encode file {file}')
with open(file, 'r') as f:
for idx, line in enumerate(f):
json_ = json.loads(line)
history = ''
for his in json_['history']:
if len(his) != 0:
                        history += '\n'.join(his)
if len(history) == 0:
item = _filter_content(
f"{json_['input'].strip()}\n{json_['output'].strip()}{TrainerTools().tokenizer.text_end}")
else:
item = _filter_content(
f"{history}{json_['input'].strip()}\n{json_['output'].strip()}{TrainerTools().tokenizer.text_end}")
item = TrainerTools().tokenizer.encode(item.strip())
item_count = len(item)
if item_count > short_thresholds[file_idx]:
result_long.append(item)
tokens_count_long += item_count
else:
result_short.append(item)
tokens_count_short += item_count
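                # Flush encoded samples to disk in ~4e8-token chunks to keep memory bounded.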
if tokens_count_long >= 4e8:
with open(f'./data/tmp/pretrain_long_{tag_list[file_idx]}_{suffix_long}.pkl', 'wb') as f:
pickle.dump(result_long, f)
result_long.clear()
tokens_count_long = 0
suffix_long += 1
if tokens_count_short >= 4e8:
with open(f'./data/tmp/pretrain_short_{tag_list[file_idx]}_{suffix_short}.pkl', 'wb') as f:
pickle.dump(result_short, f)
result_short.clear()
tokens_count_short = 0
suffix_short += 1
with open(f'./data/tmp/pretrain_short_{tag_list[file_idx]}.pkl', 'wb') as f:
pickle.dump(result_short, f)
with open(f'./data/tmp/pretrain_long_{tag_list[file_idx]}.pkl', 'wb') as f:
pickle.dump(result_long, f)
def get_self_cognition(add_think_tag=False):
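    """Build encoded self-cognition chat samples; optionally append ' /no think' to the user turn."""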
result = []
with open('./data/raw/self_cognition.jsonl', 'r') as f:
for line in f:
json_ = json.loads(line)
user = f"{json_['query']}"
if add_think_tag:
user = f"{user} /no think"
content = json_['response'].replace('{{AUTHOR}}', developer_name).replace('{{NAME}}', assistant_name)
chat_template = [
{'role': 'system', 'content': " "},
{'role': 'user', 'content': user},
{'role': 'assistant', 'think': ' ', 'content': f"{content.strip()}"}
]
encoded = TrainerTools().tokenizer.apply_chat_template(chat_template)
result.append(encoded)
return result
def merge_pretrain_data():
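    """Shuffle and merge the per-language short/long chunks into the final pretrain pickles."""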
print('start merge short data')
    # Merge en_0 into zh_0 and zh_1
with open('./data/tmp/pretrain_short_en_0.pkl', 'rb') as f:
en = pickle.load(f)
en_0_mid = len(en) // 2
en_0 = en[:en_0_mid]
en_1 = en[en_0_mid:]
del en
merge_froms = [en_0, en_1]
merge_tos = [0, 1]
for merge_from, merge_to in zip(merge_froms, merge_tos):
result = merge_from
with open(f'./data/tmp/pretrain_short_zh_{merge_to}.pkl', 'rb') as f:
to_content = pickle.load(f)
result.extend(to_content)
flat_result = list(itertools.chain.from_iterable(shuffle(result)))
with open(f'./data/pretrain_short_{merge_to}.pkl', 'wb') as f:
pickle.dump(flat_result, f)
short_zh_list = [
'pretrain_short_zh_2.pkl',
'pretrain_short_zh_3.pkl',
'pretrain_short_zh_4.pkl',
'pretrain_short_zh_5.pkl',
'pretrain_short_zh_6.pkl',
'pretrain_short_zh.pkl',
]
short_en_list = [
'pretrain_short_en_1.pkl',
'pretrain_short_en_2.pkl',
'pretrain_short_en_3.pkl',
'pretrain_short_en_4.pkl',
'pretrain_short_en_5.pkl',
'pretrain_short_en.pkl',
]
for idx in range(len(short_zh_list)):
result = []
with open(f'./data/tmp/{short_zh_list[idx]}', 'rb') as f:
zh = pickle.load(f)
result.extend(zh)
del zh
with open(f'./data/tmp/{short_en_list[idx]}', 'rb') as f:
en = pickle.load(f)
result.extend(en)
del en
flat_result = list(itertools.chain.from_iterable(shuffle(result)))
with open(f'./data/pretrain_short_{idx + 2}.pkl', 'wb') as f:
pickle.dump(flat_result, f)
del flat_result
print('start merge long data')
long_list = [
'pretrain_long_en_0.pkl',
'pretrain_long_en.pkl',
'pretrain_long_zh_0.pkl',
'pretrain_long_zh.pkl',
'cmm_math.pkl',
'wikipedia.pkl',
'github_code.pkl'
]
result = []
for idx in range(len(long_list)):
with open(f'./data/tmp/{long_list[idx]}', 'rb') as f:
temp = pickle.load(f)
result.extend(temp)
result = shuffle(result)
results = [result[:len(result)//2], result[len(result)//2:]]
for idx, result in enumerate(results):
print(f'start dump long {idx}')
flat_result = list(itertools.chain.from_iterable(result))
with open(f'./data/pretrain_long_{idx}.pkl', 'wb') as f:
pickle.dump(flat_result, f)
print(f'end dump long {idx}')
print('finish...')
def preprocess_cot_data():
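    """Build chain-of-thought SFT samples from distill_r1_110k_sft and alpaca_r1_data_zh-localpost."""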
result = get_self_cognition()
print('encode distill_r1_110k_sft')
with open('./data/raw/distill_r1_110k_sft.jsonl', 'r') as f:
for line in f:
json_ = json.loads(line)
user = json_['instruction']
output = json_['output']
            think, content = _extract_think_and_answer(output)
think = _filter_content(think)
content = _filter_content(content)
chat_template = [
{'role': 'system', 'content': " "},
{'role': 'user', 'content': user.strip()},
{'role': 'assistant', 'think': think.strip(), 'content': content.strip()}
]
encoded = TrainerTools().tokenizer.apply_chat_template(chat_template)
if len(encoded) > 2048:
continue
result.append(encoded)
print('encode alpaca_r1_data_zh-localpost')
with open('./data/raw/alpaca_r1_data_zh-localpost.json', 'r') as f:
json_ = json.loads(f.read())
for line in json_:
user = line['instruction']
output = line['output']
            think, content = _extract_think_and_answer(output)
think = _filter_content(think)
content = _filter_content(content)
chat_template = [
{'role': 'system', 'content': " "},
{'role': 'user', 'content': user},
{'role': 'assistant', 'think': think.strip(), 'content': content.strip()}
]
encoded = TrainerTools().tokenizer.apply_chat_template(chat_template)
if len(encoded) > 2048:
continue
result.append(encoded)
result = shuffle(result)
print('dump')
with open('./data/cot_sft.pkl', 'wb') as f:
pickle.dump(result, f)
def preprocess_grpo_data():
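    """Build GRPO prompt/answer pairs from the Chinese GSM8K parquet files."""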
qas = []
for file_name in ['train-00000-of-00001.parquet', 'test-00000-of-00001.parquet']:
df = pd.read_parquet(f"./data/raw/gsm8k_chinese/{file_name}", engine="pyarrow")
for q, a in zip(df['question_zh-cn'].values, df['answer_only'].values):
q_template = [
{'role': 'system', 'content': " "},
                {'role': 'user', 'content': str(q)}
]
prompt = TrainerTools().tokenizer.apply_chat_template(q_template)
if len(prompt) > 2048:
continue
qas.append({
'prompt': prompt,
'answer': TrainerTools().tokenizer.encode(str(a))
})
qas = shuffle(qas)
with open(f'./data/grpo.pkl', 'wb') as f:
pickle.dump(qas, f)
def preprocess_mix_data():
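    """Build mixed /think and /no think SFT samples from r1_mix_1024 plus self-cognition data."""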
    # Add the self-cognition dataset,
    # tagged with /think and /no think
result = get_self_cognition(True)
with open('./data/raw/r1_mix_1024.jsonl', 'r') as f:
for line in f:
json_ = json.loads(line)
conversations = json_['conversations']
chat_template = [{'role': 'system', 'content': " "}]
for conversation in conversations:
if conversation['role'] == 'user':
chat_template.append({'role': 'user', 'content': conversation['content'].strip()})
elif conversation['role'] == 'assistant':
                    # CoT responses carry an explicit <think> tag
                    if '<think>' in conversation['content']:
chat_template[-1]['content'] = f"{chat_template[-1]['content']} /think"
chat_template.append({'role': 'assistant', 'content': _filter_content(conversation['content'].strip())})
else:
chat_template[-1]['content'] = f"{chat_template[-1]['content']} /no think"
chat_template.append({'role': 'assistant', 'think': ' ', 'content': f"<answer>{_filter_content(conversation['content'].strip())}</answer>"})
encoded = TrainerTools().tokenizer.apply_chat_template(chat_template, add_answer_tag_for_assistant=False)
if len(encoded) > 2048:
continue
result.append(encoded)
result = shuffle(result)
print('dump')
with open('./data/mix_sft.pkl', 'wb') as f:
pickle.dump(result, f)
def preprocess_dpo_data():
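    """Build chosen/rejected DPO pairs from dpo_zh.json and dpo_en.json."""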
dpo_list = []
for file_item in ['dpo_zh.json', 'dpo_en.json']:
with open(f'./data/raw/dpo/{file_item}', 'r') as f:
json_ = json.loads(f.read())
for item in json_:
system = " "
conversations = item['conversations']
chosen = item['chosen']
rejected = item['rejected']
chat_template = [{'role': 'system', 'content': system}]
for conversation in conversations:
if conversation['from'] == 'system':
continue
if conversation['from'] == 'human':
chat_template.append({'role': 'user', 'content': f"{conversation['value']} /no think"})
else:
chat_template.append({'role': 'assistant', 'think': ' ', 'content': _filter_content(conversation['value'])})
chosen_template = []
chosen_template.extend(chat_template)
chosen_template.append({'role': 'assistant', 'think':' ', 'content': _filter_content(chosen['value'])})
rejected_template = []
rejected_template.extend(chat_template)
rejected_template.append({'role': 'assistant', 'think':' ', 'content': _filter_content(rejected['value'])})
chosen = TrainerTools().tokenizer.apply_chat_template(chosen_template)
rejected = TrainerTools().tokenizer.apply_chat_template(rejected_template)
if len(chosen) > 2048 or len(rejected) > 2048:
continue
encode_item = {
'chosen': chosen,
'rejected': rejected,
}
dpo_list.append(encode_item)
dpo_list = shuffle(dpo_list)
with open(f'./data/dpo.pkl', 'wb') as f:
pickle.dump(dpo_list, f)
if __name__ == '__main__':
_init()
sample_github_code()
preprocess_wikipedia()
preprocess_cmm_math()
preprocess_pretrain_data()
merge_pretrain_data()
preprocess_cot_data()
preprocess_grpo_data()
preprocess_mix_data()
preprocess_dpo_data()