File size: 5,972 Bytes
2b7bf83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
#!/bin/bash

# Copyright 2020 Tomoki Hayashi
#  MIT License (https://opensource.org/licenses/MIT)

# Prepare kaldi-style data directory for JSSS corpus

# Default option values. The usage text below advertises each of them as a
# "--<name>" command-line option; utils/parse_options.sh (sourced below) is
# expected to perform the overriding.
fs=24000
num_dev=50
num_eval=50
train_set="train_nodev"
dev_set="dev"
eval_set="eval"
shuffle=false

# shellcheck disable=SC1091
. utils/parse_options.sh || exit 1;

# Positional arguments. They are read before the argument-count check on
# purpose: "set -u" is not enabled yet, so a missing argument simply expands
# to an empty string here instead of aborting before the usage is printed.
db=$1
data_dir_root=$2

# check arguments
if [ $# -ne 2 ]; then
    echo "Usage: $0 [Options] <db> <data_dir>"
    echo "e.g.: $0 downloads/jsss_ver1 data"
    echo ""
    echo "Options:"
    echo "    --fs: target sampling rate (default=24000)."
    echo "    --num_dev: number of development utterances (default=50)."
    echo "    --num_eval: number of evaluation utterances (default=50)."
    echo "    --train_set: name of train set (default=train_nodev)."
    echo "    --dev_set: name of dev set (default=dev)."
    echo "    --eval_set: name of eval set (default=eval)."
    echo "    --shuffle: whether to perform shuffle in making dev / eval set (default=false)."
    exit 1
fi

# Strict mode is switched on only after option parsing and the usage check:
# from here on, any failing command, unset variable or failing pipeline stage
# aborts the script.
set -euo pipefail

######################################
#    process data without segments   #
######################################
# In these subsets each wav file yields exactly one segments entry whose
# boundaries are taken from the .lab file with the same base name.
dsets_without_segments="
short-form/basic5000
short-form/onomatopee300
short-form/voiceactress100
simplification
"
for dset in ${dsets_without_segments}; do
    # the output directory is named after the last path component of the subset
    _data_dir=${data_dir_root}/$(basename "${dset}")
    if [ ! -e "${_data_dir}" ]; then
        mkdir -p "${_data_dir}"
    fi

    # output files
    scp=${_data_dir}/wav.scp
    segments=${_data_dir}/segments

    # drop leftovers from an earlier run, since the loop below only appends
    if [ -e "${scp}" ]; then
        rm "${scp}"
    fi
    if [ -e "${segments}" ]; then
        rm "${segments}"
    fi

    # one segments line and one wav.scp line per wav file, in sorted path order
    find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r wav_path; do
        wav_name=${wav_path##*/}
        # utterance id = file name without its last extension
        utt_id=${wav_name%.*}
        lab_path="${db}/${dset}/lab/$(basename "${wav_path}" .wav).lab"
        if [ ! -e "${lab_path}" ]; then
            echo "${lab_path} does not exist. Skipped."
            continue
        fi

        # segment start: 2nd space-separated field of the first lab line;
        # segment end: 1st space-separated field of the last lab line
        seg_start=$(head -n 1 "${lab_path}" | cut -d " " -f 2)
        seg_end=$(tail -n 1 "${lab_path}" | cut -d " " -f 1)
        echo "${utt_id} ${utt_id} ${seg_start} ${seg_end}" >> "${segments}"

        if [ "${fs}" -eq 24000 ]; then
            # default sampling rate: reference the wav file directly
            scp_entry="${utt_id} ${wav_path}"
        else
            # any other rate: emit a sox pipe command with the target rate
            scp_entry="${utt_id} sox ${wav_path} -t wav -r $fs - |"
        fi
        echo "${scp_entry}" >> "${scp}"
    done
    echo "Successfully prepared ${dset}."
done

######################################
#     process data with segments     #
######################################
# In these subsets one wav file contains several utterances. Each line of a
# transcript file starts with two tab-separated fields that are used as the
# start and end time of an utterance inside the wav file of the same base
# name. A per-utterance .lab file (forced-alignment result, per the comment
# further down) is then used to adjust those boundaries.
dsets_with_segments="
long-form/katsura-masakazu
long-form/udon
long-form/washington-dc
summarization
"
for dset in ${dsets_with_segments}; do
    # check directory existence
    _data_dir=${data_dir_root}/$(basename "${dset}")
    [ ! -e "${_data_dir}" ] && mkdir -p "${_data_dir}"

    # set filenames
    scp=${_data_dir}/wav.scp
    segments=${_data_dir}/segments

    # remove outputs of a previous run; both files are only appended to below
    [ -e "${scp}" ] && rm "${scp}"
    [ -e "${segments}" ] && rm "${segments}"

    # make wav.scp (one entry per wav file, keyed by the file's base name)
    find "${db}/${dset}/wav24kHz16bit" -name "*.wav" | sort | while read -r filename; do
        wav_id=${filename##*/}
        wav_id=${wav_id%.*}
        if [ "${fs}" -eq 24000 ]; then
            # default sampling rate
            echo "${wav_id} ${filename}" >> "${scp}"
        else
            echo "${wav_id} sox ${filename} -t wav -r $fs - |" >> "${scp}"
        fi
    done

    # make segments
    tab=$'\t'
    find "${db}/${dset}/transcript_utf8" -name "*.txt" | sort | while read -r filename; do
        wav_id=$(basename "${filename}" .txt)
        # "|| [ -n ... ]" keeps the final line even when the transcript does not
        # end with a newline: read returns non-zero there but still fills "line".
        while read -r line || [ -n "${line}" ]; do
            # First and second tab-separated field. Like "cut -f", a line that
            # contains no tab yields the whole line for both fields.
            start_sec=${line%%"${tab}"*}
            rest=${line#*"${tab}"}
            end_sec=${rest%%"${tab}"*}

            # utt_id = <wav_id>_<start>_<end>, where each time has its "."
            # removed and is zero-padded to 10 digits. Leading zeros are
            # stripped first so that printf does not parse the value as octal.
            utt_id=${wav_id}
            utt_id+="_$(printf %010d "$(echo "${start_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"
            utt_id+="_$(printf %010d "$(echo "${end_sec}" | tr -d "." | sed -e "s/^[0]*//g")")"

            # modify segment information with force alignment results
            lab_filename=${db}/${dset}/lab/${utt_id}.lab
            if [ ! -e "${lab_filename}" ]; then
                echo "${lab_filename} does not exist. Skipped."
                continue
            fi
            # offsets relative to the utterance: 2nd field of the first lab line
            # and 1st field of the last lab line (space-separated)
            start_sec_offset=$(head -n 1 "${lab_filename}" | cut -d " " -f 2)
            end_sec_offset=$(tail -n 1 "${lab_filename}" | cut -d " " -f 1)
            start_sec=$(python -c "print(${start_sec} + ${start_sec_offset})")
            # start_sec was already shifted by start_sec_offset on the line
            # above, so subtracting it again gives
            # <transcript start> + end_sec_offset.
            end_sec=$(python -c "print(${start_sec} + ${end_sec_offset} - ${start_sec_offset})")
            echo "${utt_id} ${wav_id} ${start_sec} ${end_sec}" >> "${segments}"
        done < "${filename}"
    done

    echo "Successfully prepared ${dset}."
done

######################################
#       combine and split data       #
######################################
# combine all data
# The per-subset directories are collected in an array so that every path is
# passed as exactly one argument. This matters most for "rm -rf": with an
# unquoted, space-joined string, a data_dir_root containing whitespace or glob
# characters would make rm operate on unintended paths.
combined_data_dirs=()
for dset in ${dsets_without_segments} ${dsets_with_segments}; do
    combined_data_dirs+=("${data_dir_root}/$(basename "${dset}")")
done
utils/combine_data.sh "${data_dir_root}/all" "${combined_data_dirs[@]}"
rm -rf -- "${combined_data_dirs[@]}"

# split
# The first (num_dev + num_eval) entries are split off as "deveval" and the
# remainder becomes the training set; "deveval" is then split into eval
# (first num_eval) and dev (num_dev).
num_all=$(wc -l < "${data_dir_root}/all/segments")
num_deveval=$((num_dev + num_eval))
num_train=$((num_all - num_deveval))
if [ "${num_train}" -lt 0 ]; then
    # fail early with a clear message instead of passing a negative count on
    echo "Error: only ${num_all} utterances available, but num_dev + num_eval = ${num_deveval}." >&2
    exit 1
fi
utils/split_data.sh \
    --num_first "${num_deveval}" \
    --num_second "${num_train}" \
    --shuffle "${shuffle}" \
    "${data_dir_root}/all" \
    "${data_dir_root}/deveval" \
    "${data_dir_root}/${train_set}"
utils/split_data.sh \
    --num_first "${num_eval}" \
    --num_second "${num_dev}" \
    --shuffle "${shuffle}" \
    "${data_dir_root}/deveval" \
    "${data_dir_root}/${eval_set}" \
    "${data_dir_root}/${dev_set}"

# remove tmp directories
rm -rf "${data_dir_root}/all"
rm -rf "${data_dir_root}/deveval"

echo "Successfully prepared data."