Spaces:
No application file
No application file
File size: 1,294 Bytes
d08dd00 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 |
#!/bin/bash
#
# Builds two sampled training corpora from per-language text files.
#
# Usage: ./gen_mtxt.sh <data dir>
#
# For every language code in `langs`, "<data dir>/<lang>.txt" is read and its
# leading lines are appended to the output file. The number of lines taken is
# the integer part of (line_count * multiplier) ^ 0.7, so languages with fewer
# lines contribute a proportionally larger share. Each input is concatenated
# five times before truncation, which means at most 5 * line_count lines are
# ever written for one language.
#
# Outputs:
#   <data dir>/train_small.txt  (multiplier 100)
#   <data dir>/train.txt        (multiplier 2100)
#
# Requires: bc (with the -l math library), wc, cat, head.
#
# NOTE: `set -o pipefail` is intentionally not enabled. `head` closes the pipe
# early in the sampling pipeline, so `cat` is expected to die from SIGPIPE.

declare -a langs=("as" "or" "kn" "ml" "ta" "te" "gu" "mr" "en" "hi" "pa" "bn")

# Prints its arguments as one diagnostic line on stderr.
err() {
  printf '%s\n' "$*" >&2
}

#######################################
# Computes how many lines to sample from a file.
# Arguments:
#   $1 - number of lines in the file (positive integer)
#   $2 - multiplier applied to the line count before exponentiation
# Outputs:
#   Integer part of ($1 * $2) ^ 0.7 on stdout.
# Returns:
#   0 on success, non-zero if bc fails or prints something non-numeric.
#######################################
sample_size() {
  local count=$1
  local multiplier=$2
  local raw whole

  # bc has no power operator for fractional exponents: x^0.7 == e(l(x)*0.7).
  raw=$(bc -l <<<"e(l($count*$multiplier)*0.7)/1") || return
  # bc prints a decimal fraction; keep only the part before the point.
  whole=${raw%.*}
  # Values below 1 are printed by bc as ".xxx", leaving nothing before the dot.
  [[ -n "$whole" ]] || whole=0
  [[ "$whole" =~ ^[0-9]+$ ]] || return 1
  printf '%s\n' "$whole"
}

#######################################
# Appends a sample of every language file to one output file.
# Globals:
#   langs (read)
# Arguments:
#   $1 - data directory containing <lang>.txt files
#   $2 - output file path; must not exist yet
#   $3 - multiplier passed to sample_size
# Outputs:
#   Progress messages on stdout, warnings/errors on stderr.
# Returns:
#   0 on success, non-zero if the output exists or a sample size cannot be
#   computed.
#######################################
generate_sample() {
  local data_dir=$1
  local output=$2
  local multiplier=$3
  local lang src lines smtlines

  if [[ -f "$output" ]]; then
    err "Output file already exists. Please remove it first"
    return 1
  fi

  for lang in "${langs[@]}"; do
    echo "Processing $lang"
    src="$data_dir/$lang.txt"

    if [[ ! -f "$src" ]]; then
      err "Skipping $lang: $src not found"
      continue
    fi

    # Reading from stdin keeps the file name out of wc's output; stripping
    # whitespace handles wc implementations that left-pad the count.
    lines=$(wc -l <"$src") || return
    lines=${lines//[[:space:]]/}

    # l(0) is not a usable logarithm, so an empty file cannot be sampled.
    if ((lines == 0)); then
      err "Skipping $lang: $src has no lines"
      continue
    fi

    smtlines=$(sample_size "$lines" "$multiplier") || {
      err "Could not compute sample size for $src"
      return 1
    }
    echo "Sampling $smtlines from $lines lines"

    # Five copies allow upsampling of small files up to 5x their size.
    cat -- "$src" "$src" "$src" "$src" "$src" \
      | head -n "$smtlines" >>"$output"
  done
}

#######################################
# Entry point: validates arguments and generates both corpora.
# Arguments:
#   $1 - data directory
#   $2 - optional; accepted and ignored, because earlier versions of this
#        script insisted on exactly two arguments while using only the first
# Returns:
#   0 on success, non-zero on usage errors or generation failure.
#######################################
main() {
  if (($# < 1 || $# > 2)); then
    err "USAGE: ./gen_mtxt.sh <data dir>"
    return 1
  fi

  local data_dir=$1

  if [[ ! -d "$data_dir" ]]; then
    err "Data directory not found: $data_dir"
    return 1
  fi

  if ! command -v bc >/dev/null 2>&1; then
    err "bc is required but was not found in PATH"
    return 1
  fi

  # Generate train small file
  generate_sample "$data_dir" "$data_dir/train_small.txt" 100 || return

  # Generate train file
  generate_sample "$data_dir" "$data_dir/train.txt" 2100 || return
}

main "$@"
|