#!/usr/bin/env bash
#
# Print out the vocabulary for all zipped models in a directory.
#
# For every *.zip found (recursively) under <model_dir>, one file named
# <vocab_dir>/<model_name>.txt is written containing the sorted, unique
# vocabulary of that model. The vocabulary is taken from the model's words.txt
# (first space-separated field) or, failing that, from Gr.fst (third
# tab-separated field of the fstprint output).
#
# Assumes fstprint is in PATH and ngramfst.so is in LD_LIBRARY_PATH.
#
# Usage: print-vocabulary <model_dir> <vocab_dir>

# pipefail matters here: without it a failing fstprint/cut would be hidden by
# the exit status of the trailing sort.
set -euo pipefail

if [ -z "${2:-}" ]; then
  echo 'Usage: print-vocabulary <model_dir> <vocab_dir>' >&2
  exit 1
fi

model_dir="$1"
vocab_dir="$2"
mkdir -p "${vocab_dir}"

# Scratch space for extracted model files; removed on every exit path.
temp_dir="$(mktemp -d)"
function finish {
  rm -rf "${temp_dir}"
}
trap finish EXIT

# Write the sorted, de-duplicated vocabulary found in an extraction directory
# to stdout.
# Arguments: $1 - directory holding an extracted words.txt and/or Gr.fst
# Returns:   0 on success; non-zero if neither file is present or if any
#            stage of the extraction pipeline fails.
function print_words {
  local extract_dir="$1"
  if [ -f "${extract_dir}/words.txt" ]; then
    cut -d' ' -f1 < "${extract_dir}/words.txt" | sort -u
  elif [ -f "${extract_dir}/Gr.fst" ]; then
    fstprint "${extract_dir}/Gr.fst" | cut -f3 | sort -u
  else
    return 1
  fi
}

# NUL-delimited so that unusual file names (spaces, newlines) survive intact.
find "${model_dir}" -name '*.zip' -type f -print0 |
  while IFS= read -r -d '' zip_file; do
    model_name="$(basename "${zip_file}" .zip)"
    vocab_file="${vocab_dir}/${model_name}.txt"

    # A non-empty vocabulary file means this model was already processed.
    if [ -s "${vocab_file}" ]; then
      echo "Skipping ${model_name} (${vocab_file})"
      continue
    fi

    extract_dir="${temp_dir}/${model_name}"
    mkdir -p "${extract_dir}"

    # Try the known archive layouts in order; the first member that extracts
    # successfully wins and the remaining alternatives are not attempted.
    # The trailing "|| true" keeps "set -e" from aborting the whole run when
    # an archive contains none of them; that case is reported below.
    unzip -j "${zip_file}" "${model_name}/graph/Gr.fst" -d "${extract_dir}" ||
      unzip -j "${zip_file}" "${model_name}/Gr.fst" -d "${extract_dir}" ||
      unzip -j "${zip_file}" "${model_name}/words.txt" -d "${extract_dir}" ||
      unzip -j "${zip_file}" "${model_name}/graph/words.txt" -d "${extract_dir}" ||
      true

    # Build the vocabulary in the scratch directory and move it into place
    # only on success, so a failed run never leaves a partial (but non-empty)
    # file that later runs would skip.
    pending_vocab="${temp_dir}/${model_name}.vocab"
    if print_words "${extract_dir}" > "${pending_vocab}"; then
      mv "${pending_vocab}" "${vocab_file}"
    else
      rm -f "${pending_vocab}"
      echo "ERROR: can't get vocabulary for ${model_name}" >&2
    fi

    # Extracted graphs can be large; free the space before the next model.
    rm -rf "${extract_dir}"
  done