# UnIVAL/preprocess/utils.py
import os
import json
# import torch
# import clip
from PIL import Image
# import sng_parser
from tqdm import tqdm
import codecs
import numpy as np
import csv
import sys
from io import BytesIO
import base64
import pickle
from random import shuffle
import ast
# TSV columns: uniq-id, image (base64 string or path), caption, question, answer,
# ground-truth objects (objects appearing in the caption or question),
# dataset name (source of the data) and task type (caption, qa or visual grounding).
# import subprocess
from multiprocessing import Pool
# import shutil
try:
    from psutil import cpu_count
except ImportError:
    from multiprocessing import cpu_count
from functools import partial
def remove_special(input_string):
    """Keep only alphanumeric characters and spaces; drop punctuation and control characters."""
    return "".join(c for c in input_string if c == " " or c.isalnum())
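# Example (illustrative): remove_special("a man, riding a horse!") -> "a man riding a horse"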
def convert_img_to_str(file_name):
img = Image.open(file_name) # path to file
img_buffer = BytesIO()
img.save(img_buffer, format=img.format)
byte_data = img_buffer.getvalue()
base64_str = base64.b64encode(byte_data) # bytes
base64_str = base64_str.decode("utf-8") # str
return base64_str
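# Round-trip sketch (hypothetical path): the stored string can be decoded back with
# Image.open(BytesIO(base64.b64decode(base64_str))).
#   base64_str = convert_img_to_str('/path/to/image.jpg')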
def add_new_tsv(original_tsv_path, new_tsv, output_path):
tsv = []
with open(original_tsv_path) as file:
tsv_file = csv.reader(file, delimiter='\t')
for line in tqdm(tsv_file):
tsv.append(line)
    start_id = len(tsv) + 1  # offset so appended ids do not collide with existing rows
    print(start_id)
    for d in tqdm(new_tsv):
        d[0] = d[0] + start_id  # assumes the new rows carry integer ids
tsv.append(d)
shuffle(tsv)
with open(output_path, 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter='\t')
for t in tqdm(tsv):
csv_output.writerow(t)
return tsv
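# Usage sketch (hypothetical paths): merge freshly built rows into an existing TSV.
#   new_rows = get_tsv_caption_data_from_json(data, start_id=0, dataset_name='cc3m',
#                                             task_type='caption', convert_images=False)
#   add_new_tsv('vision_language.tsv', new_rows, 'vision_language_merged.tsv')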
def get_tsv_caption_data_from_json(original_data, start_id, dataset_name, task_type, convert_images=True):
tsv_data = []
for i, d in tqdm(enumerate(original_data)):
caption = remove_special(d['caption'])
img_path = d['image']
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
t = [start_id, img, caption, '','', '', dataset_name, task_type]
tsv_data.append(t)
start_id+=1
return tsv_data
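# A produced caption row follows the column layout documented above, e.g.
#   [12, 'coco/train2014/xxx.jpg', 'a man riding a horse', '', '', '', 'coco_caption', 'caption']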
def get_tsv_caption_data_from_video_json(original_data, start_id, dataset_name, task_type, convert_images=True, prefix=None):
tsv_data = []
for i, d in tqdm(enumerate(original_data)):
caption = remove_special(d['caption'])
        if prefix is not None:
            img_path = os.path.join(prefix, d['video'])
        else:
            img_path = d['video']  # fall back so img_path is always defined when prefix is None
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
t = [start_id, img, caption, '','', '', dataset_name, task_type]
tsv_data.append(t)
start_id+=1
return tsv_data
def get_tsv_vqa_data_from_json(original_data, start_id, dataset_name, task_type, image_root=None, convert_images=True):
tsv_data = []
for i, d in tqdm(enumerate(original_data)):
question = remove_special(d['question'])
img_path = d['image']
if 'COCO_' in img_path:
img_path = os.path.join('coco/', img_path)
if image_root is not None:
img_path = os.path.join(image_root, img_path)
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
answers = set(d['answer'])
answer_weight = {}
for ans in answers:
ans = remove_special(ans)
            if ans in answer_weight:
answer_weight[ans] += 1/len(answers)
else:
answer_weight[ans] = 1/len(answers)
ans_ = ["{:.1f}".format(conf)+'|!+'+ans for ans, conf in answer_weight.items()]
ans_ = '&&'.join(ans_)
t = [start_id, img, '', question, ans_, '', dataset_name, task_type]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
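# The answer field packs weighted answers as '<conf>|!+<answer>' joined by '&&';
# two answers over a 2-element answer set each get weight 0.5, e.g. '0.5|!+red&&0.5|!+blue'.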
def get_tsv_vqa_synth_data_from_json(original_data, start_id, dataset_name, task_type, image_root=None, convert_images=True, data_type='all'):
tsv_data = []
for i, d in tqdm(enumerate(original_data)):
        # keep only samples matching the requested data_type ('manual', 'auto', or 'all')
        keep = ((data_type == 'manual' and 'manual' in d['dataset'])
                or (data_type == 'auto' and 'manual' not in d['dataset'])
                or data_type == 'all')
        if not keep:
            continue
question = remove_special(d['question'])
img_path = d['image']
if 'COCO_' in img_path:
img_path = os.path.join('coco/', img_path)
if image_root is not None:
img_path = os.path.join(image_root, img_path)
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
answers = set(d['answer'])
answer_weight = {}
for ans in answers:
ans = remove_special(ans)
            if ans in answer_weight:
answer_weight[ans] += 1/len(answers)
else:
answer_weight[ans] = 1/len(answers)
ans_ = ["{:.1f}".format(conf)+'|!+'+ans for ans, conf in answer_weight.items()]
ans_ = '&&'.join(ans_)
t = [start_id, img, '', question, ans_, '', dataset_name, task_type]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
def get_tsv_from_vg_grounding(regions, data, start_id, dataset_name='visual_genome', task_type='visual_grounding', convert_images=True, split='train', thresh=16384):
original_data = json.load(open(regions,'r'))
image_data = json.load(open(data,'r'))
id_2_imagepath = {}
for d in tqdm(image_data):
id_ = int(d['image'].split('/')[-1].split('.')[0])
id_2_imagepath[id_] = d['image']
tsv_data = []
for d in tqdm(original_data):
img_path = id_2_imagepath[d['id']]
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
for reg in d['regions']:
width = reg['width']
height = reg['height']
x = reg['x']
y = reg['y']
area = width*height
if area < thresh:
x1, y1, x2, y2 = x, y, x + width, y + height # top left, bottom right
box = '{:.2f},{:.2f},{:.2f},{:.2f}'.format(x1, y1, x2, y2)
sentence = remove_special(reg['phrase'])
t = [start_id, img, sentence, '', box, '', dataset_name, task_type]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
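# Usage sketch (hypothetical paths): the default thresh=16384 keeps regions smaller
# than 128x128 pixels.
#   rows = get_tsv_from_vg_grounding('vg/region_descriptions.json', 'vg/image_data.json',
#                                    start_id=0, convert_images=False)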
def get_tsv_from_refcoco(ref_path, instances_path, start_id, dataset_name='refcoco_train', task_type='visual_grounding', convert_images=True, split='train'):
refs = pickle.load(open(ref_path, 'rb'))
instances = json.load(open(instances_path,'r'))
id_to_annot = {}
for annot in tqdm(instances['annotations']):
id_to_annot[annot['id']] = annot
id_to_images = {}
for annot in tqdm(instances['images']):
id_to_images[annot['id']] = annot
tsv_data = []
for ref in tqdm(refs):
ref_split = ref['split']
if ref_split == split:
image_id = ref['image_id']
file_name = id_to_images[ref['image_id']]['file_name']
if ref_split == 'train':
file_name = os.path.join('coco/train2014', file_name)
if convert_images:
img_path = os.path.join('/data/mshukor/data/', file_name)
img = convert_img_to_str(img_path)
else:
img_path = file_name.replace('/data/mshukor/data/', '')
img = img_path
ann_id = ref['ann_id']
annot = id_to_annot[ann_id]
            bbox = annot['bbox']  # COCO format: x, y, w, h with (x, y) at the top-left corner
x1, y1, x2, y2 = bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3] # top left, bottom right
box = '{:.2f},{:.2f},{:.2f},{:.2f}'.format(x1, y1, x2, y2)
for sent in ref['sentences']:
sentence = remove_special(sent['sent'])
# [id, image, 'third book starting from left', '', '29.1,11.72,66.81,343.41', '', 'refcoco_train', 'visual_grounding']
t = [start_id, img, sentence, '', box, '', dataset_name, task_type]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
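# Usage sketch (hypothetical paths): refs(unc).p holds the referring expressions,
# instances.json the COCO-style annotations.
#   rows = get_tsv_from_refcoco('refcoco/refs(unc).p', 'refcoco/instances.json',
#                               start_id=0, convert_images=False, split='train')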
def get_tsv_data_from_jsons(datasets, start_id, task_types, image_root=None, convert_images=True):
tsvs = []
for (original_data_path, task_type) in zip(datasets, task_types):
print(task_type)
if task_type == 'caption':
dataset_name = original_data_path.split('/')[-1].split('.')[0]
print(dataset_name,'start_id:', start_id)
original_data = json.load(open(original_data_path,'r'))
tsvs += get_tsv_caption_data_from_json(original_data=original_data, start_id=start_id, dataset_name=dataset_name, task_type=task_type, convert_images=convert_images)
elif task_type == 'qa':
dataset_name = original_data_path.split('/')[-1].split('.')[0]
print(dataset_name,'start_id:', start_id)
original_data = json.load(open(original_data_path,'r'))
tsvs += get_tsv_vqa_data_from_json(original_data=original_data, start_id=start_id, dataset_name=dataset_name, task_type=task_type, image_root=image_root, convert_images=convert_images)
elif task_type == 'visual_grounding':
dataset_name = original_data_path[0].split('/')[-2].replace('+', '')+'_train'
print(dataset_name,'start_id:', start_id)
if dataset_name == 'refcoco_train':
tsvs += get_tsv_from_refcoco(original_data_path[0], original_data_path[1], start_id, dataset_name=dataset_name, task_type=task_type, convert_images=convert_images, split='train')
elif task_type == 'detection':
dataset_name = original_data_path[0]
if dataset_name == 'vg':
tsvs+= get_tsv_from_vg_detection(original_data_path[1], original_data_path[2], start_id, convert_images=convert_images, split='train')
elif dataset_name == 'coco':
tsvs+= get_tsv_from_coco_detection(original_data_path[1], start_id, convert_images=convert_images, split='train')
        else:
            raise ValueError(f'unknown task type: {task_type}')
        start_id = max(t[0] for t in tsvs) + 1  # some builders shuffle their rows, so use the max id, not the last row's
shuffle(tsvs)
return tsvs
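# Usage sketch (hypothetical paths): mix several datasets into one shuffled TSV.
#   tsvs = get_tsv_data_from_jsons(['coco_caption.json', 'vqa_train.json'], start_id=0,
#                                  task_types=['caption', 'qa'], convert_images=False)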
def create_imagenet_txt_files(path_data, output_path, dataset='imagenet'):
data = []
# start_id = 0
    for root, dirs, files in os.walk(path_data):
for d in tqdm(dirs):
dir_path = os.path.join(root, d)
for _, _, dir_files in os.walk(dir_path):
for f in dir_files:
file_path = os.path.join(dir_path, f)
if dataset == 'imagenet21k':
file_path = '/'.join(file_path.split('/')[-3:])
elif dataset == 'openimages':
file_path = '/'.join(file_path.split('/')[-4:])
elif dataset == 'yfcc':
file_path = '/'.join(file_path.split('/')[-5:])
elif dataset == 'imagenet':
file_path = '/'.join(file_path.split('/')[-5:])
else:
file_path = '/'.join(file_path.split('/')[-4:])
image_id = f.split('.')[0]
tmp = [image_id, file_path]
data.append(tmp)
# start_id+=1
with open(output_path, 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter='\t')
for t in tqdm(data):
csv_output.writerow(t)
def get_tsv_from_vg_detection(instances_path, path_images, start_id, convert_images=True, split='train'):
print('start id:', start_id)
instances = json.load(open(instances_path,'r'))
id_to_objects = {}
for d in instances:
id_to_objects[d['id']] = d
id_to_image_path = {}
    for root, dirs, files in os.walk(path_images):
for d in dirs:
dir_path = os.path.join(root, d)
for _, _, dir_files in os.walk(dir_path):
for f in dir_files:
file_path = os.path.join(dir_path, f)
file_path = '/'.join(file_path.split('/')[-4:])
image_id = f.split('.')[0]
id_to_image_path[image_id] = file_path
tsv_data = []
    missed = []
for ref in tqdm(id_to_image_path.keys()):
ref_split = split
image_id = ref
file_name = id_to_image_path[image_id]
if convert_images:
img_path = os.path.join('/data/mshukor/data/', file_name)
img = convert_img_to_str(img_path)
else:
img_path = file_name.replace('/data/mshukor/data/', '')
img = img_path
if int(image_id) in id_to_objects:
objects = id_to_objects[int(image_id)]['objects']
else:
            missed.append(image_id)
continue
if len(objects) == 0:
            missed.append(image_id)
continue
areas = []
detections = []
for annot in objects:
            x, y, w, h = annot['x'], annot['y'], annot['w'], annot['h']  # (x, y) is the box's top-left corner
area = w*h
x1, y1, x2, y2 = x, y, x + w, y + h # top left, bottom right
x1 = max(0, x1)
x2 = max(0, x2)
            # annot['names'] is a list of name strings; clean each name before joining
            category = ','.join(remove_special(name) for name in annot['names']).replace('\x00', '')
object_id = annot['id']
tmp = '{:.3f},{:.3f},{:.3f},{:.3f},{},{}'.format(x1, y1, x2, y2, object_id, category)
detections.append(tmp)
areas.append(area)
sorted_indices = sorted(range(len(areas)), key=lambda k: areas[k], reverse=True)
detections = [detections[k] for k in sorted_indices]
detections = '&&'.join(detections)
t = [start_id, img, detections]
tsv_data.append(t)
start_id+=1
    print('missed images:', len(missed))
shuffle(tsv_data)
return tsv_data
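# Each detection row stores '&&'-joined boxes sorted by area (largest first), each in the
# format x1,y1,x2,y2,object_id,category, e.g. '12.000,30.000,480.000,300.000,1058549,tree'.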
def get_tsv_from_coco_detection(instances_path, start_id, convert_images=True, split='train'):
print('start id:', start_id)
instances = json.load(open(instances_path,'r'))
imgid_to_annot = {}
for annot in tqdm(instances['annotations']):
if annot['image_id'] not in imgid_to_annot:
imgid_to_annot[annot['image_id']] = [annot]
else:
imgid_to_annot[annot['image_id']].append(annot)
id_to_category = {}
for annot in tqdm(instances['categories']):
id_to_category[annot['id']] = annot['name']
tsv_data = []
    missed = []
for ref in tqdm(instances['images']):
ref_split = split
image_id = ref['id']
file_name = ref['file_name']
if ref_split == 'train':
file_name = os.path.join('coco/train2014', file_name)
if convert_images:
img_path = os.path.join('/data/mshukor/data/', file_name)
img = convert_img_to_str(img_path)
else:
img_path = file_name.replace('/data/mshukor/data/', '')
img = img_path
if image_id not in imgid_to_annot:
            missed.append(image_id)
continue
annots = imgid_to_annot[image_id]
detections = []
areas = []
for annot in annots:
            bbox = annot['bbox']  # COCO format: x, y, w, h with (x, y) at the top-left corner
area = bbox[2]*bbox[3]
x1, y1, x2, y2 = bbox[0], bbox[1], bbox[0] + bbox[2], bbox[1] + bbox[3] # top left, bottom right
# box = '{:.3f},{:.3f},{:.3f},{:.3f}'.format(x1, y1, x2, y2)
object_id = annot['category_id']
category = remove_special(id_to_category[object_id])
tmp = '{:.3f},{:.3f},{:.3f},{:.3f},{},{}'.format(x1, y1, x2, y2, object_id, category)
areas.append(area)
detections.append(tmp)
sorted_indices = sorted(range(len(areas)), key=lambda k: areas[k], reverse=True)
detections = [detections[k] for k in sorted_indices]
detections = '&&'.join(detections)
t = [start_id, img, detections]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
def imagepath_to_image_size(img_path, dir_path):
img_path = os.path.join(dir_path, img_path)
w, h = Image.open(img_path).size
# imageid_to_meta_dict[img_path] = [w, h]
return w, h, img_path
def save_imageid_to_meta_dict(path_images, output_path, mp=False, num_workers=1):
id_to_image_path = {}
    for file in os.listdir(path_images):
        image_id = file.split('.')[0]
        id_to_image_path[image_id] = file  # store the bare filename; dir_path is joined on later
imageid_to_meta_dict = {}
if mp:
iterable = list(id_to_image_path.values())
mp_func = partial(imagepath_to_image_size, dir_path=path_images,)
        num_cores = cpu_count()
        print(f"Running with {num_cores} logical cores, {num_workers} workers")
with Pool(num_workers) as pool, tqdm(total=len(iterable), desc="running") as pbar:
for idx, res in enumerate(pool.imap_unordered(mp_func, iterable, chunksize=32)):
w, h, img_path = res
imageid_to_meta_dict[img_path] = [w, h]
pbar.update(1)
else:
for k, p in tqdm(id_to_image_path.items()):
            w, h, img_path = imagepath_to_image_size(p, dir_path=path_images)  # (img_path, dir_path) argument order
imageid_to_meta_dict[img_path] = [w, h]
print(len(imageid_to_meta_dict))
with open(output_path, 'w') as f:
json.dump(imageid_to_meta_dict, f)
return imageid_to_meta_dict
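# Usage sketch (hypothetical paths): precompute image sizes once, in parallel, so that
# get_tsv_from_openimages_detection can avoid opening every image.
#   meta = save_imageid_to_meta_dict('openimages/train', 'openimages_meta.json',
#                                    mp=True, num_workers=8)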
def get_tsv_from_openimages_detection(instances_path, path_images, class_path,
start_id, convert_images=False, split='train', image_root='/gpfsdswork/dataset', image_meta=None):
id_to_image_path = {}
    for file in os.listdir(path_images):
        image_id = file.split('.')[0]
        id_to_image_path[image_id] = file
def imagepath_to_image_size(img_path):
w, h = Image.open(img_path).size
return w, h
id_to_annot = {}
with open(instances_path) as file:
tsv_file = csv.reader(file, delimiter='\t')
for i, line in tqdm(enumerate(tsv_file)):
if i == 0:
continue # skip header
            img_id = line[0].split(',')[0]  # the annotation file is comma-separated, so each tab-read row arrives as one field
if img_id in id_to_annot:
id_to_annot[img_id].append(line)
else:
id_to_annot[img_id] = [line]
classid_to_class = {}
with open(class_path) as file:
tsv_file = csv.reader(file, delimiter=',')
for i, line in tqdm(enumerate(tsv_file)):
classid_to_class[line[0]] = line[1]
if image_meta is not None:
image_size = json.load(open(image_meta, 'r'))
else:
image_size = None
tsv_data = []
for i, img_id in tqdm(enumerate(id_to_annot.keys())):
annots = id_to_annot[img_id]
if img_id in id_to_image_path:
img_path = id_to_image_path[img_id]
orig_img_path = os.path.join(path_images, img_path)
save_img_path = os.path.join(image_root, img_path)
if image_size is None:
w, h = imagepath_to_image_size(orig_img_path)
else:
w, h = image_size[orig_img_path]
if convert_images:
img = convert_img_to_str(orig_img_path)
else:
img = save_img_path
areas = []
detections = []
for d in annots:
                d = d[0].split(',')  # split the underlying comma-separated row
                x1, x2, y1, y2 = d[4:8]  # OpenImages columns: XMin, XMax, YMin, YMax (normalized to [0, 1])
                x1, x2, y1, y2 = float(x1), float(x2), float(y1), float(y2)
                x1, x2, y1, y2 = x1 * w, x2 * w, y1 * h, y2 * h  # scale to absolute pixel coordinates
box_w, box_h = x2 - x1, y2 - y1
area = box_w*box_h
areas.append(area)
object_id = d[2]
category = remove_special(classid_to_class[object_id])
tmp = '{:.3f},{:.3f},{:.3f},{:.3f},{},{}'.format(x1, y1, x2, y2, object_id, category)
detections.append(tmp)
sorted_indices = sorted(range(len(areas)), key=lambda k: areas[k], reverse=True)
detections = [detections[k] for k in sorted_indices]
detections = '&&'.join(detections)
t = [start_id, img, detections]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
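# Usage sketch (hypothetical paths): class_path maps OpenImages label ids such as
# '/m/0cmf2' to readable names.
#   rows = get_tsv_from_openimages_detection('oidv6-train-annotations-bbox.csv',
#                                            'openimages/train', 'class-descriptions-boxable.csv',
#                                            start_id=0, image_meta='openimages_meta.json')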
def replace_image_id_by_path(input_tsv, output_tsv, mapping_file):
selected_cols='0,1,2'
data = []
selected_col_ids = [int(col_id) for col_id in selected_cols.split(",")]
with open(input_tsv) as file:
tsv_file = csv.reader(file, delimiter='\t')
for line in tqdm(tsv_file):
d = [line[i] for i in selected_col_ids]
data.append(d)
im_id_to_path = {}
with open(mapping_file) as file:
tsv_file = csv.reader(file, delimiter='\t')
for line in tqdm(tsv_file):
d = [line[i] for i in [0, 1]]
im_id_to_path[d[0]] = d[1]
for d in tqdm(data):
im_id = d[1].split('/')[-1].split('.')[0]
im_path = im_id_to_path[im_id]
d[1] = im_path
with open(output_tsv, 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter='\t')
for t in tqdm(data):
csv_output.writerow(t)
return data
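# Usage sketch (hypothetical paths): mapping_file is a two-column TSV of image id -> path,
# e.g. one written by create_imagenet_txt_files.
#   replace_image_id_by_path('data.tsv', 'data_with_paths.tsv', 'imagenet_id_to_path.tsv')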
#### Video
def get_tsv_msrvtt_vqa_data_from_json(original_data, start_id, image_root=None, convert_images=False):
tsv_data = []
for i, d in tqdm(enumerate(original_data)):
question = remove_special(d['question'])+'?'
img_path = d['video']
img_id = img_path.split('.')[0]
if image_root is not None:
img_path = os.path.join(image_root, img_path)
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
answer = remove_special(d['answer'])
conf = 1.0
ans = "{:.1f}".format(conf)+'|!+'+answer
t = [start_id, img_id, question, ans, '', img]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
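# Note: the video-QA rows use a shorter layout than the image TSVs above:
#   [id, video_id, question, answer, '', video_path_or_base64]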
def get_tsv_msrvtt_caption_data_from_json(original_data, start_id, image_root=None, convert_images=False):
tsv_data = []
for i, d in tqdm(enumerate(original_data)):
caption = d['caption']
if isinstance(caption, list):
cs = [remove_special(c) for c in caption]
caption = '&&'.join(cs)
else:
caption = remove_special(caption)
img_path = d['video']
img_id = img_path.split('.')[0]
if image_root is not None:
img_path = os.path.join(image_root, img_path)
if convert_images:
img = convert_img_to_str(img_path)
else:
img_path = img_path.replace('/data/mshukor/data/', '')
img = img_path
t = [start_id, img_id, caption, '', img]
tsv_data.append(t)
start_id+=1
shuffle(tsv_data)
return tsv_data
#### Pile
def get_tsv_from_piletext_data(path, output_path, start_id=0, num_max_characters=2500, dataset_names=None, keepspecial=False):
print("consider only", dataset_names)
tsv = []
failed = 0
with open(output_path, 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter='\t')
with open(path, "rb") as f:
for d in tqdm(f):
d_str = d.decode("UTF-8")
                d_dict = ast.literal_eval(d_str)  # parse the JSONL line as a Python literal (values here are plain strings)
data_name = d_dict['meta']['pile_set_name']
text = d_dict['text'][:num_max_characters]
if dataset_names is not None and data_name in dataset_names:
text = text.replace('\t', ' ').replace("\n", ' ').replace('\"', '')
if not keepspecial:
text = remove_special(text)
item = [start_id, text]
                    try:
                        csv_output.writerow(item)
                    except Exception:  # e.g. UnicodeEncodeError on unusual characters
                        failed += 1
                        continue
start_id+=1
tsv.append(item)
print("len", len(tsv), "failed", failed)
return tsv
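# Usage sketch (hypothetical paths and subset names): extract selected Pile subsets.
#   get_tsv_from_piletext_data('pile/00.jsonl', 'pile_00.tsv',
#                              dataset_names=['Wikipedia (en)', 'BookCorpus2'])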
def save_pile_tsvs(path, output_path, dataset_names, dir_names=None, keepspecial=False, num_max_characters=1500, prefix=''):
    print('prepare:', dir_names)
    tsv = []  # ensure a defined return value even if no matching file is found
    for filename in os.listdir(path):
if dir_names is not None and filename in dir_names:
input_path = os.path.join(path, filename)
if 'jsonl' in filename:
output_file_name = filename.split('.')[0]+prefix+'_pile.tsv'
output_file_name = os.path.join(output_path, output_file_name)
print("creating:", output_file_name, "from", input_path)
tsv = get_tsv_from_piletext_data(input_path, output_file_name, start_id=0, num_max_characters=num_max_characters,
dataset_names=dataset_names, keepspecial=keepspecial)
return tsv
def add_pile_tsvs(path, output_path='pile_all.tsv', key='pile.tsv'):
start_id = 0
with open(output_path, 'w', newline='') as f_output:
csv_output = csv.writer(f_output, delimiter='\t')
for filename in os.listdir(path):
input_path = os.path.join(path, filename)
if key in filename:
with open(input_path) as file:
tsv_file = csv.reader((line.replace('\0','') for line in file), delimiter='\t')
for line in tqdm(tsv_file):
line[0] = start_id
csv_output.writerow(line)
start_id+=1
        print('last id written:', start_id - 1)
# return tsv
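# Usage sketch (hypothetical paths): concatenate the per-file TSVs with fresh sequential ids.
#   add_pile_tsvs('pile_tsvs/', output_path='pile_all.tsv', key='pile.tsv')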