sunit333's picture
Upload 63 files
d08dd00 verified
raw
history blame contribute delete
No virus
1.29 kB
#!/bin/bash
#
# Builds two sampled corpora, train_small.txt and train.txt, inside <data dir>
# from the per-language files <data dir>/<lang>.txt.
#
# For a language file with N lines, floor((N * multiplier)^0.7) lines are
# requested (multiplier 100 for train_small.txt, 2100 for train.txt). The
# lines are taken from the start of REPEATS back-to-back copies of the file,
# so at most REPEATS * N lines can actually be written for one language.
#
# Usage: ./gen_mtxt.sh <data dir>
# Exit status: 0 on success; 1 on a usage error, a missing prerequisite, an
# already existing output file, or when a language file could not be read.

# Prints a message on stderr and aborts the whole script with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Only the first argument is ever read, which is also what the usage text
# documents. A second argument is still tolerated (and ignored) because the
# script used to insist on exactly two.
if (( $# < 1 || $# > 2 )); then
  die "USAGE: ./gen_mtxt.sh <data dir>"
fi

declare -a langs=("as" "or" "kn" "ml" "ta" "te" "gu" "mr" "en" "hi" "pa" "bn")
DATA_DIR="$1"
# Number of back-to-back copies of a language file that are sampled from.
# NOTE(review): this caps the up-sampling at 5x; presumably intentional.
readonly REPEATS=5
# Becomes 1 as soon as a language has to be skipped because it is unreadable.
status=0

[[ -d "$DATA_DIR" ]] || die "Data directory not found: $DATA_DIR"
command -v bc > /dev/null || die "bc is required but was not found in PATH"

#######################################
# Appends the sampled lines of every language to one output file.
# Globals:   langs, DATA_DIR, REPEATS (read), status (written)
# Arguments: $1 - output file to create (must not exist yet)
#            $2 - multiplier applied to the line count before the 0.7 power
# Outputs:   progress on stdout, warnings on stderr
# Returns:   0; exits the script with status 1 if the output already exists
#######################################
generate() {
  local output="$1" multiplier="$2"
  local lang input lines smtlines i
  local -a copies

  if [[ -f "$output" ]]; then
    die "Output file already exists. Please remove it first"
  fi

  for lang in "${langs[@]}"; do
    echo "Processing $lang"
    input="$DATA_DIR/$lang.txt"
    if [[ ! -r "$input" ]]; then
      echo "Skipping $lang: cannot read $input" >&2
      status=1
      continue
    fi

    # Reading from stdin keeps the file name out of wc's output; some wc
    # implementations left-pad the count, so whitespace is stripped as well.
    lines=$(wc -l < "$input")
    lines=${lines//[[:space:]]/}
    if (( lines == 0 )); then
      # l(0) is undefined, so there is nothing sensible to compute or sample.
      echo "Skipping $lang: $input contains no complete line" >&2
      continue
    fi

    # bc's ^ operator only takes integer exponents, hence x^0.7 is written
    # as e(l(x)*0.7). The fractional digits are cut off afterwards.
    smtlines=$(echo "e(l($lines*$multiplier)*0.7)/1" | bc -l)
    smtlines=${smtlines%.*}
    echo "Sampling $smtlines from $lines lines"
    if (( smtlines > REPEATS * lines )); then
      echo "Warning: only $(( REPEATS * lines )) lines available for $lang" >&2
    fi

    copies=()
    for (( i = 0; i < REPEATS; i++ )); do
      copies+=("$input")
    done
    # head closing the pipe early makes cat stop with SIGPIPE; that is the
    # expected way for this pipeline to end, so only head's status counts.
    cat -- "${copies[@]}" | head -n "$smtlines" >> "$output" \
      || die "Failed to write to $output"
  done
}

# Generate train small file
generate "$DATA_DIR/train_small.txt" 100
# Generate train file
generate "$DATA_DIR/train.txt" 2100

exit "$status"