File size: 3,879 Bytes
2b7bf83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#!/bin/bash

# Split a data directory into two data directories

# Copyright 2019 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# Load the recipe environment; stop right away if it cannot be sourced.
# shellcheck disable=SC1091
if ! . ./path.sh; then
    exit 1
fi

# Default values of the options listed in the usage message. They are assigned
# before utils/parse_options.sh is sourced; presumably that helper overrides
# them from --shuffle / --num_first / --num_second and removes the options from
# the positional parameters -- confirm against utils/parse_options.sh.
num_second=0
num_first=0
shuffle=false

# shellcheck disable=SC1091
if ! . utils/parse_options.sh; then
    exit 1
fi

# Exactly three positional arguments (source dir and two destination dirs) are
# required at this point; otherwise print the help text and stop with status 1.
if (( $# != 3 )); then
    cat <<EOF
Usage: $0 <src_dir> <dist_dir_1> <dist_dir_2> ...
e.g.: $0 data/all data/train data/deveval

Options:
    --shuffle: Whether to perform shuffle (default=false).
    --num_first: Number of utts in the first dist dir.
        If set to 0, it will be automatically decided (default=0).
    --num_second: Number of utts in the second dist dir.
        If set to 0, it will be automatically decided (default=0).
EOF
    exit 1
fi

# From here on, abort on any unhandled command failure (-e) and on any use of
# an unset variable (-u).
set -eu

# Positional arguments: source data dir and the two destination data dirs.
src_dir=$1
first_dist_dir=$2
second_dist_dir=$3

# wav.scp is needed in every mode, so fail early with a clear message instead
# of relying on a later redirection error.
src_scp=${src_dir}/wav.scp
if [ ! -f "${src_scp}" ]; then
    echo "ERROR: ${src_scp} does not exist." >&2
    exit 1
fi

# The number of utterances is the number of lines in segments when that file
# exists, and the number of lines in wav.scp otherwise.
#
# Lines are counted with awk rather than `wc -l`: wc counts newline
# characters, so a final line without a trailing newline would be missed and
# the later head/tail split would silently drop one utterance. awk also prints
# the number without the leading padding some wc implementations emit.
#
# NOTE: src_segments is only assigned when segments exists; because of
# `set -u` it must only be dereferenced when has_segments is true.
if [ -e "${src_dir}/segments" ]; then
    has_segments=true
    src_segments=${src_dir}/segments
    num_src_utts=$(awk 'END { print NR }' "${src_segments}")
else
    has_segments=false
    num_src_utts=$(awk 'END { print NR }' "${src_scp}")
fi

# check number of utts
#
# Resolve num_first / num_second so that they always add up to num_src_utts:
#   both 0       -> split in half (the second part gets the extra utterance)
#   only one set -> the other one is the remainder
#   both set     -> they must add up to num_src_utts exactly

# Reject anything that is not a non-negative integer up front. Negative or
# non-numeric values would match none of the cases below and reach head/tail
# unchecked, producing wrong or empty outputs without any error.
for count in "${num_first}" "${num_second}"; do
    case "${count}" in
        "" | *[!0-9]*)
            echo "ERROR: num_first and num_second must be non-negative integers. (${num_first}, ${num_second})" >&2
            exit 1
            ;;
    esac
done
# Force base 10 so that values with leading zeros (e.g. 010) are not read as
# octal by the arithmetic expansions below.
num_first=$((10#${num_first}))
num_second=$((10#${num_second}))

if [ "${num_first}" -eq 0 ] && [ "${num_second}" -eq 0 ]; then
    num_first=$((num_src_utts / 2))
    num_second=$((num_src_utts - num_first))
elif [ "${num_second}" -eq 0 ]; then
    # Only num_first was given; it must leave at least one utt for the second.
    if [ "${num_src_utts}" -le "${num_first}" ]; then
        echo "ERROR: num_first must be less than # utts in src. (${num_first} vs ${num_src_utts})" >&2
        exit 1
    fi
    num_second=$((num_src_utts - num_first))
elif [ "${num_first}" -eq 0 ]; then
    # Only num_second was given; it must leave at least one utt for the first.
    if [ "${num_src_utts}" -le "${num_second}" ]; then
        echo "ERROR: num_second must be less than # utts in src. (${num_second} vs ${num_src_utts})" >&2
        exit 1
    fi
    num_first=$((num_src_utts - num_second))
else
    # Both were given explicitly.
    if [ "${num_src_utts}" -ne "$((num_first + num_second))" ]; then
        echo "ERROR: num_first + num_second must be the same # utts in src. ($((num_first + num_second)) vs ${num_src_utts})" >&2
        exit 1
    fi
fi

# Create both destination directories (and any missing parents).
# `mkdir -p` succeeds silently when the directory already exists, so no
# existence test is needed. If a path exists but is not a directory, mkdir
# fails here with a clear message (and `set -e` stops the script) instead of
# the problem surfacing later as an output redirection error.
mkdir -p -- "${first_dist_dir}" "${second_dist_dir}"

# split

#######################################
# Write the first ${num_first} lines of a file to one output and the last
# ${num_second} lines to another; each output is sorted. When shuffle is
# enabled the lines are put in random order before being divided.
# Globals:   shuffle, num_first, num_second (read)
# Arguments: $1 - source file
#            $2 - output file for the first part
#            $3 - output file for the second part
#######################################
split_lines() {
    local src=$1
    local first_out=$2
    local second_out=$3
    local shuffled=""

    if "${shuffle}"; then
        # Shuffle into a private temporary file instead of writing
        # "<source>.unsorted" next to the input: the source directory may be
        # read-only or shared with another run.
        shuffled=$(mktemp)
        sort -R "${src}" > "${shuffled}"
        src=${shuffled}
    fi
    head -n "${num_first}" "${src}" | sort > "${first_out}"
    tail -n "${num_second}" "${src}" | sort > "${second_out}"
    if [ -n "${shuffled}" ]; then
        rm -f -- "${shuffled}"
    fi
}

#######################################
# Build the wav.scp that belongs to a segments file: keep every line of the
# source wav.scp whose first field is a recording id used in the second column
# of the segments file. The result is ordered by recording id.
# Ids are compared as exact strings in a single awk pass, so characters such
# as "." in an id are not treated as regular-expression wildcards and the
# separator after the id may be a space or a tab.
# Globals:   src_scp (read)
# Arguments: $1 - segments file
#            $2 - wav.scp file to write
# Returns:   non-zero (after printing an error) if a recording id referenced
#            by the segments file has no entry in the source wav.scp.
#######################################
extract_wav_scp() {
    local segments=$1
    local out=$2

    # Remove any previous output first so that an existing symlink is replaced
    # rather than written through.
    rm -rf -- "${out}"
    # "pass=N" operands are awk variable assignments that take effect when awk
    # reaches them in the argument list; they tell the two input files apart
    # even when the segments file is empty.
    awk '
        pass == 1 { if (NF >= 2) wanted[$2] = 1; next }
        ($1 in wanted) { found[$1] = 1; print }
        END {
            status = 0
            for (id in wanted) {
                if (!(id in found)) {
                    printf "ERROR: recording id %s is not found in wav.scp.\n", id > "/dev/stderr"
                    status = 1
                }
            }
            exit status
        }
    ' pass=1 "${segments}" pass=2 "${src_scp}" > "${out}"
    # Order by recording id; -s keeps the source order of lines sharing an id.
    sort -s -k1,1 -o "${out}" "${out}"
}

if "${has_segments}"; then
    # Split segments first, then derive the wav.scp of each part from it.
    split_lines "${src_segments}" "${first_dist_dir}/segments" "${second_dist_dir}/segments"
    extract_wav_scp "${first_dist_dir}/segments" "${first_dist_dir}/wav.scp"
    extract_wav_scp "${second_dist_dir}/segments" "${second_dist_dir}/wav.scp"
else
    split_lines "${src_scp}" "${first_dist_dir}/wav.scp" "${second_dist_dir}/wav.scp"
fi

echo "Successfully split data directory."