Spaces:
Runtime error
Runtime error
File size: 3,879 Bytes
2b7bf83 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 |
#!/bin/bash
# Split data directory into two data directories
# Copyright 2019 Tomoki Hayashi
# MIT License (https://opensource.org/licenses/MIT)
# shellcheck disable=SC1091
. ./path.sh || exit 1;

# Default option values (presumably overridden by utils/parse_options.sh
# when the matching --<name> flag is given; see the usage text below).
num_first=0
num_second=0
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

# Exactly three positional arguments are required; otherwise show the usage.
if [ "$#" -ne 3 ]; then
    cat <<EOF
Usage: $0 <src_dir> <dist_dir_1> <dist_dir_2> ...
e.g.: $0 data/all data/train data/deveval

Options:
 --shuffle: Whether to perform shuffle (default=false).
 --num_first: Number of utts in the first dist dir.
 If set to 0, it will be automatically decided (default=0).
 --num_second: Number of utts in the second dist dir.
 If set to 0, it will be automatically decided (default=0).
EOF
    exit 1
fi
set -eu

# Positional arguments: source data directory and the two output directories.
src_dir=$1
first_dist_dir=$2
second_dist_dir=$3
src_scp="${src_dir}/wav.scp"

# Utterances are counted as lines of wav.scp, unless the source directory
# also has a segments file, in which case its lines are counted instead.
has_segments=false
utt_list=${src_scp}
if [ -e "${src_dir}/segments" ]; then
    has_segments=true
    src_segments="${src_dir}/segments"
    utt_list=${src_segments}
fi
num_src_utts=$(wc -l < "${utt_list}")
# check number of utts
# Reject anything that is not a plain non-negative integer before doing
# arithmetic on it: a negative count would match none of the cases below and
# reach head/tail unchecked, silently producing a wrong split.
for _count in "${num_first}" "${num_second}"; do
    case "${_count}" in
        "" | *[!0-9]*)
            echo "ERROR: num_first and num_second must be non-negative integers. (${num_first} and ${num_second})" >&2
            exit 1
            ;;
    esac
done
unset _count
# Force base 10 so zero-padded values such as "08" are not parsed as octal.
num_first=$((10#${num_first}))
num_second=$((10#${num_second}))

if [ "${num_first}" -eq 0 ] && [ "${num_second}" -eq 0 ]; then
    # Neither size given: split in half, the second part takes the odd utt.
    num_first=$((num_src_utts / 2))
    num_second=$((num_src_utts - num_first))
elif [ "${num_second}" -eq 0 ]; then
    # Only num_first given: the second part gets the remainder.
    if [ "${num_src_utts}" -le "${num_first}" ]; then
        echo "ERROR: num_first must be less than # utts in src. (${num_first} vs ${num_src_utts})" >&2
        exit 1
    fi
    num_second=$((num_src_utts - num_first))
elif [ "${num_first}" -eq 0 ]; then
    # Only num_second given: the first part gets the remainder.
    if [ "${num_src_utts}" -le "${num_second}" ]; then
        echo "ERROR: num_second must be less than # utts in src. (${num_second} vs ${num_src_utts})" >&2
        exit 1
    fi
    num_first=$((num_src_utts - num_second))
elif [ "${num_src_utts}" -ne "$((num_first + num_second))" ]; then
    # Both given: together they must cover the source exactly.
    echo "ERROR: num_first + num_second must be the same # utts in src. ($((num_first + num_second)) vs ${num_src_utts})" >&2
    exit 1
fi
# check directory existence
for dist_dir in "${first_dist_dir}" "${second_dist_dir}"; do
    # Create the output directory (with parents) unless something already
    # exists at that path.
    if [ ! -e "${dist_dir}" ]; then
        mkdir -p "${dist_dir}"
    fi
done
# split

#######################################
# Split a line-oriented list file into two sorted parts.
# Arguments:
#   $1 - source list file
#   $2 - output file for the first part
#   $3 - output file for the second part
#   $4 - number of lines taken from the head for the first part
#   $5 - number of lines taken from the tail for the second part
#   $6 - "true" to shuffle the lines (sort -R) before splitting
# Returns:
#   0 on success, non-zero if any step failed
#######################################
split_list() {
    local src=$1 first_out=$2 second_out=$3
    local n_first=$4 n_second=$5 do_shuffle=$6
    local ordered=${src} scratch="" status=0

    if [ "${do_shuffle}" = true ]; then
        # Keep the shuffled copy in a mktemp file rather than next to the
        # source, so the source directory is never written to.
        scratch=$(mktemp) || return 1
        if ! sort -R "${src}" > "${scratch}"; then
            rm -f "${scratch}"
            return 1
        fi
        ordered=${scratch}
    fi

    head -n "${n_first}" "${ordered}" | sort > "${first_out}" || status=$?
    if [ "${status}" -eq 0 ]; then
        tail -n "${n_second}" "${ordered}" | sort > "${second_out}" || status=$?
    fi

    if [ -n "${scratch}" ]; then
        rm -f "${scratch}"
    fi
    return "${status}"
}

#######################################
# Write the wav.scp entries whose wav id (first column) is referenced by the
# second column of a segments file. Ids are compared as exact strings, so
# characters such as "." in an id are not treated as regex wildcards.
# Arguments:
#   $1 - segments file (wav id in the second column)
#   $2 - source wav.scp (wav id in the first column)
#   $3 - output wav.scp (sorted)
# Outputs:
#   one ERROR line on stderr per referenced wav id missing from $2
# Returns:
#   0 on success, non-zero if a wav id is missing or a step failed
#######################################
filter_scp_by_segments() {
    local segments=$1 scp=$2 out=$3
    # FILENAME is compared instead of using NR == FNR so that an empty
    # segments file is handled correctly.
    awk '
        FILENAME == ARGV[1] { wanted[$2] = 1; next }
        ($1 in wanted)      { print; seen[$1] = 1 }
        END {
            for (id in wanted) {
                if (!(id in seen)) {
                    printf("ERROR: wav id \"%s\" is not found in %s\n", id, ARGV[2]) > "/dev/stderr"
                    bad = 1
                }
            }
            exit bad
        }
    ' "${segments}" "${scp}" > "${out}" || return 1
    sort -o "${out}" "${out}"
}

if [ "${has_segments}" = true ]; then
    # split segments at first
    split_list "${src_segments}" \
        "${first_dist_dir}/segments" "${second_dist_dir}/segments" \
        "${num_first}" "${num_second}" "${shuffle}"
    # split wav.scp
    filter_scp_by_segments "${first_dist_dir}/segments" "${src_scp}" "${first_dist_dir}/wav.scp"
    filter_scp_by_segments "${second_dist_dir}/segments" "${src_scp}" "${second_dist_dir}/wav.scp"
else
    split_list "${src_scp}" \
        "${first_dist_dir}/wav.scp" "${second_dist_dir}/wav.scp" \
        "${num_first}" "${num_second}" "${shuffle}"
fi
echo "Successfully split data directory."
|