#!/usr/bin/env bash
# Split data directory into two data directories
# Copyright 2019 Tomoki Hayashi
# MIT License (https://opensource.org/licenses/MIT)
# Source the project-local path.sh (not visible here); abort if it is missing.
# shellcheck disable=SC1091
. ./path.sh || exit 1

# Defaults for the command-line options. They must be assigned BEFORE
# utils/parse_options.sh is sourced; that helper is presumably what overrides
# them from --shuffle / --num_first / --num_second (confirm against the helper).
shuffle=false
num_first=0
num_second=0

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1
# Exactly three positional arguments are required. Anything else prints the
# usage text to stderr (it is a diagnostic for a failed invocation) and exits 1.
if [ $# -ne 3 ]; then
    cat >&2 << EOF
Usage: $0 <src_dir> <dist_dir_1> <dist_dir_2>
e.g.: $0 data/all data/train data/deveval

Options:
    --shuffle: Whether to perform shuffle (default=false).
    --num_first: Number of utts in the first dist dir.
        If set to 0, it will be automatically decided (default=0).
    --num_second: Number of utts in the second dist dir.
        If set to 0, it will be automatically decided (default=0).
EOF
    exit 1
fi
# From here on: abort on any unhandled command failure or unset variable.
set -eu

# Positional arguments: source data dir and the two destination data dirs.
src_dir=$1
first_dist_dir=$2
second_dist_dir=$3

# wav.scp is needed in every mode, so fail early with a clear message
# instead of a bare redirection error later on.
src_scp=${src_dir}/wav.scp
if [ ! -e "${src_scp}" ]; then
    echo "ERROR: ${src_scp} does not exist." >&2
    exit 1
fi

# The number of source utterances is the line count of segments when that
# file exists, otherwise the line count of wav.scp.
if [ -e "${src_dir}/segments" ]; then
    has_segments=true
    src_segments=${src_dir}/segments
    num_src_utts=$(wc -l < "${src_segments}")
else
    has_segments=false
    num_src_utts=$(wc -l < "${src_scp}")
fi
# check number of utts
# Reject anything that is not a plain non-negative integer up front; otherwise
# negative or non-numeric values would slip past every branch below and reach
# head/tail unchanged.
for requested_count in "${num_first}" "${num_second}"; do
    case "${requested_count}" in
        "" | *[!0-9]*)
            echo "ERROR: num_first and num_second must be non-negative integers. (${num_first} and ${num_second})" >&2
            exit 1
            ;;
    esac
done

if [ "${num_first}" -eq 0 ] && [ "${num_second}" -eq 0 ]; then
    # Neither size given: split in half, the second part gets the remainder.
    num_first=$((num_src_utts / 2))
    num_second=$((num_src_utts - num_first))
elif [ "${num_second}" -eq 0 ]; then
    # Only num_first given: the second part is whatever is left over.
    if [ "${num_src_utts}" -le "${num_first}" ]; then
        echo "ERROR: num_first must be less than # utts in src. (${num_first} vs ${num_src_utts})" >&2
        exit 1
    fi
    num_second=$((num_src_utts - num_first))
elif [ "${num_first}" -eq 0 ]; then
    # Only num_second given: the first part is whatever is left over.
    if [ "${num_src_utts}" -le "${num_second}" ]; then
        echo "ERROR: num_second must be less than # utts in src. (${num_second} vs ${num_src_utts})" >&2
        exit 1
    fi
    num_first=$((num_src_utts - num_second))
else
    # Both given: together they must cover the source exactly.
    if [ "${num_src_utts}" -ne "$((num_first + num_second))" ]; then
        echo "ERROR: num_first + num_second must be the same # utts in src. ($((num_first + num_second)) vs ${num_src_utts})" >&2
        exit 1
    fi
fi
# check directory existence
# mkdir -p is a no-op for existing directories and fails immediately when a
# path exists but is not a directory, instead of failing later on a redirect.
mkdir -p -- "${first_dist_dir}" "${second_dist_dir}"
# split
# The list that gets divided is segments when the source has one, otherwise
# wav.scp itself.
if "${has_segments}"; then
    list_name=segments
    src_list=${src_segments}
else
    list_name=wav.scp
    src_list=${src_scp}
fi

if "${shuffle}"; then
    # Shuffle into a private temp file instead of "<src>.unsorted" so the
    # source directory is never written to; the trap removes it on any exit.
    shuffled_list=$(mktemp)
    trap 'rm -f -- "${shuffled_list}"' EXIT
    sort -R "${src_list}" > "${shuffled_list}"
    ordered_list=${shuffled_list}
else
    ordered_list=${src_list}
fi

# First num_first lines go to the first dir, last num_second lines to the
# second dir; each output is re-sorted.
head -n "${num_first}" "${ordered_list}" | sort > "${first_dist_dir}/${list_name}"
tail -n "${num_second}" "${ordered_list}" | sort > "${second_dist_dir}/${list_name}"

if "${has_segments}"; then
    # split wav.scp
    # For each output dir, keep the wav.scp entries whose first field appears
    # as the second field of that dir's segments file. One awk pass replaces a
    # grep per recording: ids are compared literally (not as regexes) and
    # tab-separated wav.scp lines match too. awk is the last pipeline stage so
    # that its non-zero exit (missing recording) stops the script under set -e.
    for dist_dir in "${first_dist_dir}" "${second_dist_dir}"; do
        sort "${src_scp}" | awk '
            FILENAME == ARGV[1] { wanted[$2]; next }
            ($1 in wanted) { found[$1]; print }
            END {
                for (id in wanted) {
                    if (!(id in found)) {
                        printf "ERROR: recording %s is not found in wav.scp.\n", id > "/dev/stderr"
                        missing = 1
                    }
                }
                exit (missing ? 1 : 0)
            }
        ' "${dist_dir}/segments" - > "${dist_dir}/wav.scp"
    done
fi

echo "Successfully split data directory."