# update tokenizer (commit 24d0b1d)
# Copyright (c) 2024 Tsinghua Univ. (authors: Xingchen Song)
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" Example Usage
cpu:
s3tokenizer --root_path /path/to/audio/files \
--model speech_tokenizer_v1 \
--device "cpu" \
--batch_size 32
gpu:
torchrun --nproc_per_node=8 --nnodes=1 \
--rdzv_id=2024 --rdzv_backend="c10d" --rdzv_endpoint="localhost:0" \
`which s3tokenizer` --root_path /path/to/audio/files \
--model speech_tokenizer_v1 \
--device "cuda" \
--batch_size 32
"""
import argparse
import os
import torch
import torch.distributed as dist
from torch.utils.data import DataLoader, Dataset, DistributedSampler
from tqdm import tqdm
import s3tokenizer
class AudioDataset(Dataset):
    """Dataset that recursively scans ``root_path`` for audio files.

    The directory walk is parallelised with a thread pool, and the
    resulting file list can be cached on disk (pickle) so subsequent runs
    skip the scan entirely.
    """

    def __init__(self, root_path, extensions=('.wav', '.flac', '.mp3'),
                 use_cache=True, cache_file=None, max_workers=8):
        """
        Args:
            root_path: directory scanned recursively for audio files.
            extensions: accepted file suffixes (any iterable of str).
            use_cache: if True, load/save the file list via ``cache_file``.
            cache_file: cache path; defaults to
                ``root_path/.audio_file_cache.pkl``.
            max_workers: thread count for the parallel directory scan.
        """
        self.data = []
        # Tuple default avoids the shared-mutable-default-argument pitfall
        # (the original used a list literal as the default).
        extensions = tuple(extensions)
        if cache_file is None:
            cache_file = os.path.join(root_path, '.audio_file_cache.pkl')
        # Fast path: reuse a previously cached file list.
        if use_cache and os.path.exists(cache_file):
            import pickle
            print(f"Loading file list from cache: {cache_file}")
            try:
                with open(cache_file, 'rb') as f:
                    self.data = pickle.load(f)
                print(f"Loaded {len(self.data)} files from cache")
                return
            except Exception as e:
                # Corrupt/stale cache: fall through to a fresh scan.
                print(f"Failed to load cache: {e}, scanning directory...")
        print(f"Scanning directory: {root_path}")
        print(f"Looking for extensions: {list(extensions)}")
        from concurrent.futures import ThreadPoolExecutor, as_completed

        def scan_directory(dirpath):
            """Return matching files directly inside one directory."""
            files = []
            try:
                with os.scandir(dirpath) as entries:
                    for entry in entries:
                        # str.endswith accepts a tuple of suffixes, so no
                        # per-extension loop is needed.
                        if entry.is_file() and entry.name.endswith(extensions):
                            files.append(entry.path)
            except PermissionError:
                # Unreadable directories are silently skipped (best effort).
                pass
            return files

        # Collect every directory first so the scans can run in parallel.
        all_dirs = [root_path]
        for dirpath, dirnames, _ in os.walk(root_path):
            all_dirs.extend(os.path.join(dirpath, d) for d in dirnames)
        with ThreadPoolExecutor(max_workers=max_workers) as executor:
            futures = [executor.submit(scan_directory, d) for d in all_dirs]
            with tqdm(total=len(all_dirs), desc="Scanning directories") as pbar:
                for future in as_completed(futures):
                    self.data.extend(future.result())
                    pbar.update(1)
        # Sort for a deterministic ordering across runs and ranks.
        self.data.sort()
        if len(self.data) == 0:
            raise ValueError(f"No audio files found in {root_path}")
        print(f"Found {len(self.data)} audio files")
        # Persist the list for the next run; failures are non-fatal.
        if use_cache:
            try:
                import pickle
                print(f"Saving file list to cache: {cache_file}")
                cache_dir = os.path.dirname(cache_file)
                if cache_dir and not os.path.exists(cache_dir):
                    os.makedirs(cache_dir, exist_ok=True)
                with open(cache_file, 'wb') as f:
                    pickle.dump(self.data, f)
            except Exception as e:
                print(f"Failed to save cache: {e}")

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        """Load one file; return ``(path, mel)``, or ``(None, None)`` on error.

        Failed items are filtered out later by ``collate_fn``.
        """
        file_path = self.data[idx]
        try:
            audio = s3tokenizer.load_audio(file_path)
            mel = s3tokenizer.log_mel_spectrogram(audio)
            return file_path, mel
        except Exception as e:
            print(f"Error processing {file_path}: {e}")
            return None, None
def collate_fn(batch):
    """Drop failed items, then pad the surviving mels into one batch.

    Returns ``(file_paths, mels, mels_lens)``; ``([], None, None)`` when
    every item in the batch failed to load.
    """
    valid = [(path, mel) for path, mel in batch if path is not None]
    if not valid:
        return [], None, None
    file_paths = [path for path, _ in valid]
    mels = [mel for _, mel in valid]
    padded_mels, mels_lens = s3tokenizer.padding(mels)
    return file_paths, padded_mels, mels_lens
def init_distributed():
    """Join the NCCL process group using torchrun's environment variables.

    Returns:
        ``(world_size, local_rank, rank)`` as ints (defaults 1/0/0 when the
        variables are absent).
    """
    world_size, local_rank, rank = (
        int(os.environ.get(key, default))
        for key, default in (('WORLD_SIZE', 1), ('LOCAL_RANK', 0), ('RANK', 0))
    )
    print('Inference on multiple gpus, this gpu {}'.format(local_rank) +
          ', rank {}, world_size {}'.format(rank, world_size))
    # Pin this process to its local GPU before joining the group.
    torch.cuda.set_device(local_rank)
    dist.init_process_group("nccl")
    return world_size, local_rank, rank
def get_args():
    """Build and parse the command line for speech-code extraction."""
    parser = argparse.ArgumentParser(description='extract speech code')
    # Required options.
    parser.add_argument('--model', required=True, type=str,
                        choices=[
                            "speech_tokenizer_v1", "speech_tokenizer_v1_25hz",
                            "speech_tokenizer_v2_25hz"
                        ],
                        help='model version')
    parser.add_argument('--root_path', required=True, type=str,
                        help='root directory containing audio files')
    parser.add_argument('--device', required=True, type=str,
                        choices=["cuda", "cpu"],
                        help='device for inference')
    parser.add_argument('--batch_size', required=True, type=int,
                        help='batch size (per-device) for inference')
    # DataLoader tuning.
    parser.add_argument('--num_workers', type=int, default=4,
                        help='workers for dataloader')
    parser.add_argument('--prefetch', type=int, default=5,
                        help='prefetch for dataloader')
    # File-discovery behaviour.
    parser.add_argument('--extensions', nargs='+',
                        default=['.wav', '.flac', '.mp3'],
                        help='audio file extensions to process')
    parser.add_argument('--use_cache', action='store_true',
                        help='use cached file list to avoid re-scanning')
    parser.add_argument('--no_cache', action='store_true',
                        help='force re-scan even if cache exists')
    parser.add_argument('--cache_file', type=str, default=None,
                        help='path to cache file (default: root_path/.audio_file_cache.pkl)')
    parser.add_argument('--scan_workers', type=int, default=8,
                        help='number of workers for directory scanning')
    parser.add_argument('--file_list', type=str, default=None,
                        help='path to pre-generated file list (one file per line)')
    parser.add_argument('--skip_existing', action='store_true',
                        help='skip files that already have _fsq.pt output')
    return parser.parse_args()
def save_tokens(file_path, codes, codes_len):
    """Persist the first ``codes_len`` tokens next to the source audio.

    The output file shares the audio file's basename, with the extension
    replaced by ``_fsq.pt``.

    Args:
        file_path: path of the source audio file.
        codes: token tensor for this file (padded).
        codes_len: number of valid tokens at the front of ``codes``.

    Returns:
        Path of the written ``.pt`` file.
    """
    stem, _ = os.path.splitext(file_path)
    output_path = f"{stem}_fsq.pt"
    # Drop the padding tail before saving.
    torch.save(codes[:codes_len], output_path)
    return output_path
def main():
    """Entry point: tokenize every discovered audio file and save the codes.

    Runs single-process on CPU, or one process per GPU under torchrun
    (NCCL + DistributedSampler) on CUDA. Each input file produces a
    ``<stem>_fsq.pt`` tensor next to it.
    """
    args = get_args()

    if args.device == "cuda":
        assert torch.cuda.is_available()
        world_size, local_rank, rank = init_distributed()
    else:
        world_size, local_rank, rank = 1, 0, 0

    device = torch.device(args.device)
    model = s3tokenizer.load_model(args.model).to(device)
    model.eval()  # inference only; disable dropout/batchnorm updates

    # Build the dataset from either an explicit file list or a directory scan.
    if args.file_list:
        print(f"Loading file list from: {args.file_list}")
        with open(args.file_list, 'r') as f:
            file_paths = [line.strip() for line in f if line.strip()]

        class FileListDataset(Dataset):
            """Dataset over an explicit list of audio file paths."""

            def __init__(self, file_paths, skip_existing=False):
                self.data = []
                skipped_existing = 0
                for fp in file_paths:
                    if skip_existing:
                        # BUG FIX: the original used fp.replace('.wav', ...),
                        # which only handled .wav inputs and corrupted paths
                        # containing '.wav' mid-string. Derive the output path
                        # the same way save_tokens does.
                        output_path = os.path.splitext(fp)[0] + '_fsq.pt'
                        if os.path.exists(output_path):
                            print(f'*******skip file {output_path}')
                            skipped_existing += 1
                            continue
                    self.data.append(fp)
                print(f"Will process {len(self.data)} files")
                if skip_existing and skipped_existing > 0:
                    print(f"Skipped {skipped_existing} already processed files")

            def __len__(self):
                return len(self.data)

            def __getitem__(self, idx):
                """Load one file; (None, None) marks a failure for collate_fn."""
                file_path = self.data[idx]
                try:
                    if not os.path.exists(file_path):
                        print(f"File not found: {file_path}")
                        return None, None
                    audio = s3tokenizer.load_audio(file_path)
                    mel = s3tokenizer.log_mel_spectrogram(audio)
                    return file_path, mel
                except Exception as e:
                    print(f"Error processing {file_path}: {e}")
                    return None, None

        dataset = FileListDataset(file_paths, skip_existing=args.skip_existing)
    else:
        # Directory scan with optional on-disk caching of the file list.
        dataset = AudioDataset(
            args.root_path,
            args.extensions,
            use_cache=not args.no_cache,
            cache_file=args.cache_file,
            max_workers=args.scan_workers,
        )
        if args.skip_existing:
            original_count = len(dataset.data)
            # Same output-path derivation as save_tokens.
            dataset.data = [
                fp for fp in dataset.data
                if not os.path.exists(os.path.splitext(fp)[0] + '_fsq.pt')
            ]
            print(f"Skipping {original_count - len(dataset.data)} already processed files")

    if args.device == "cuda":
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank])
        # Shard the dataset across ranks.
        sampler = DistributedSampler(dataset,
                                     num_replicas=world_size,
                                     rank=rank)
    else:
        sampler = None

    dataloader = DataLoader(dataset,
                            batch_size=args.batch_size,
                            sampler=sampler,
                            shuffle=False,
                            num_workers=args.num_workers,
                            prefetch_factor=args.prefetch,
                            collate_fn=collate_fn)

    total_steps = len(dataset)
    if rank == 0:
        progress_bar = tqdm(total=total_steps, desc="Processing", unit="wavs")

    processed_count = 0
    failed_count = 0
    failed_files = []
    # No autograd state is needed for pure inference; saves memory.
    with torch.no_grad():
        for file_paths, mels, mels_lens in dataloader:
            # Skip batches in which every file failed to load.
            if len(file_paths) == 0:
                continue
            codes, codes_lens = model(mels.to(device), mels_lens.to(device))
            # Move to CPU so the saved .pt files load on machines without a
            # GPU (torch.save would otherwise pin them to a CUDA device).
            codes = codes.cpu()
            for i, file_path in enumerate(file_paths):
                try:
                    output_path = save_tokens(file_path, codes[i],
                                              codes_lens[i].item())
                    # Only show the first few saves to avoid log spam.
                    if rank == 0 and processed_count < 10:
                        tqdm.write(f"Saved: {file_path} -> {output_path}")
                    processed_count += 1
                except Exception as e:
                    failed_count += 1
                    failed_files.append(file_path)
                    if rank == 0:
                        tqdm.write(f"Failed to save {file_path}: {e}")
            if rank == 0:
                # BUG FIX: the original added the *cumulative* failed_count on
                # every batch, over-counting progress. Each step covers
                # len(file_paths) items on each of world_size ranks.
                progress_bar.update(world_size * len(file_paths))

    if rank == 0:
        progress_bar.close()

    print(f"\nProcessed {processed_count} files successfully on rank {rank}")
    if failed_count > 0:
        print(f"Failed to process {failed_count} files")
        # Record failures so they can be retried via --file_list.
        failed_list_path = os.path.join(
            args.root_path if not args.file_list else ".", "failed_files.txt")
        with open(failed_list_path, 'w') as f:
            for ff in failed_files:
                f.write(f"{ff}\n")
        print(f"Failed files saved to: {failed_list_path}")

    if args.device == "cuda":
        # Synchronise all ranks before tearing down the process group.
        dist.barrier()
        dist.destroy_process_group()
# Script entry point (run directly, or via the `s3tokenizer` console script
# shown in the module docstring).
if __name__ == "__main__":
    main()