| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| #include <pocketsphinx.h> |
|
|
| #include "lm/ngram_model.h" |
| #include "util/ckd_alloc.h" |
| #include "util/cmd_ln.h" |
| #include "util/ckd_alloc.h" |
| #include "util/pio.h" |
| #include "util/strfuncs.h" |
| #include "pocketsphinx_internal.h" |
|
|
| #include <stdio.h> |
| #include <string.h> |
| #include <math.h> |
|
|
| static const ps_arg_t defn[] = { |
| { "help", |
| ARG_BOOLEAN, |
| "no", |
| "Shows the usage of the tool"}, |
|
|
| { "logbase", |
| ARG_FLOATING, |
| "1.0001", |
| "Base in which all log-likelihoods calculated" }, |
|
|
| { "lm", |
| ARG_STRING, |
| NULL, |
| "Language model file"}, |
|
|
| { "probdef", |
| ARG_STRING, |
| NULL, |
| "Probability definition file for classes in LM"}, |
|
|
| { "lmctlfn", |
| ARG_STRING, |
| NULL, |
| "Control file listing a set of language models"}, |
|
|
| { "lmname", |
| ARG_STRING, |
| NULL, |
| "Name of language model in -lmctlfn to use for all utterances" }, |
|
|
| { "lsn", |
| ARG_STRING, |
| NULL, |
| "Transcription file to evaluate"}, |
|
|
| { "text", |
| ARG_STRING, |
| NULL, |
| "Text string to evaluate"}, |
|
|
| { "mmap", |
| ARG_BOOLEAN, |
| "no", |
| "Use memory-mapped I/O for reading binary LM files"}, |
|
|
| { "lw", |
| ARG_FLOATING, |
| "1.0", |
| "Language model weight" }, |
|
|
| { "wip", |
| ARG_FLOATING, |
| "1.0", |
| "Word insertion probability" }, |
|
|
| { "verbose", |
| ARG_BOOLEAN, |
| "no", |
| "Print details of perplexity calculation" }, |
|
|
| |
| { NULL, 0, NULL, NULL } |
| }; |
|
|
| static int verbose; |
|
|
| static int |
| calc_entropy(ngram_model_t *lm, char **words, int32 n, |
| int32 *out_n_ccs, int32 *out_n_oovs, int32 *out_lm_score) |
| { |
| int32 *wids; |
| int32 startwid; |
| int32 i, ch, nccs, noovs, unk; |
|
|
| if (n == 0) |
| return 0; |
|
|
| unk = ngram_unknown_wid(lm); |
|
|
| |
| wids = ckd_calloc(n, sizeof(*wids)); |
| for (i = 0; i < n; ++i) |
| wids[n-i-1] = ngram_wid(lm, words[i]); |
| |
| startwid = ngram_wid(lm, "<s>"); |
|
|
| |
| |
| ch = noovs = nccs = 0; |
| for (i = 0; i < n; ++i) { |
| int32 n_used; |
| int32 prob; |
|
|
| |
| if (wids[i] == startwid) { |
| ++nccs; |
| continue; |
| } |
| |
| if (wids[i] == NGRAM_INVALID_WID || wids[i] == unk) { |
| ++noovs; |
| continue; |
| } |
| |
| prob = ngram_ng_score(lm, |
| wids[i], wids + i + 1, |
| n - i - 1, &n_used); |
| if (verbose) { |
| int m; |
| printf("log P(%s|", ngram_word(lm, wids[i])); |
| m = i + ngram_model_get_size(lm) - 1; |
| if (m >= n) |
| m = n - 1; |
| while (m > i) { |
| printf("%s ", ngram_word(lm, wids[m--])); |
| } |
| printf(") = %d\n", prob); |
| } |
| ch -= prob; |
| } |
|
|
| if (out_n_ccs) *out_n_ccs = nccs; |
| if (out_n_oovs) *out_n_oovs = noovs; |
|
|
| |
| n -= (nccs + noovs); |
| if (n <= 0) |
| return 0; |
| if (out_lm_score) |
| *out_lm_score = -ch; |
| return ch / n; |
| } |
|
|
| static void |
| evaluate_file(ngram_model_t *lm, logmath_t *lmath, const char *lsnfn) |
| { |
| FILE *fh; |
| lineiter_t *litor; |
| int32 nccs, noovs, nwords, lscr; |
| float64 ch, log_to_log2;; |
|
|
| if ((fh = fopen(lsnfn, "r")) == NULL) |
| E_FATAL_SYSTEM("failed to open transcript file %s", lsnfn); |
|
|
| |
| |
| log_to_log2 = log(logmath_get_base(lmath)) / log(2); |
| lscr = nccs = noovs = nwords = 0; |
| ch = 0.0; |
| for (litor = lineiter_start(fh); litor; litor = lineiter_next(litor)) { |
| char **words; |
| int32 n, tmp_ch, tmp_noovs, tmp_nccs, tmp_lscr; |
|
|
| n = str2words(litor->buf, NULL, 0); |
| if (n < 0) |
| E_FATAL("str2words(line, NULL, 0) = %d, should not happen\n", n); |
| if (n == 0) |
| continue; |
| words = ckd_calloc(n, sizeof(*words)); |
| str2words(litor->buf, words, n); |
|
|
| |
| if (words[n-1][0] == '(' |
| && words[n-1][strlen(words[n-1])-1] == ')') |
| n = n - 1; |
|
|
| tmp_ch = calc_entropy(lm, words, n, &tmp_nccs, |
| &tmp_noovs, &tmp_lscr); |
|
|
| ch += (float64) tmp_ch * (n - tmp_nccs - tmp_noovs) * log_to_log2; |
| nccs += tmp_nccs; |
| noovs += tmp_noovs; |
| lscr += tmp_lscr; |
| nwords += n; |
| |
| ckd_free(words); |
| } |
|
|
| ch /= (nwords - nccs - noovs); |
| printf("cross-entropy: %f bits\n", ch); |
|
|
| |
| printf("perplexity: %f\n", pow(2.0, ch)); |
| printf("lm score: %d\n", lscr); |
|
|
| |
| printf("%d words evaluated\n", nwords); |
| printf("%d OOVs (%.2f%%), %d context cues removed\n", |
| noovs, (double)noovs / nwords * 100, nccs); |
| } |
|
|
| static void |
| evaluate_string(ngram_model_t *lm, logmath_t *lmath, const char *text) |
| { |
| char *textfoo; |
| char **words; |
| int32 n, ch, noovs, nccs, lscr; |
|
|
| |
| textfoo = ckd_salloc(text); |
| n = str2words(textfoo, NULL, 0); |
| if (n < 0) |
| E_FATAL("str2words(textfoo, NULL, 0) = %d, should not happen\n", n); |
| if (n == 0) |
| return; |
| words = ckd_calloc(n, sizeof(*words)); |
| str2words(textfoo, words, n); |
|
|
| ch = calc_entropy(lm, words, n, &nccs, &noovs, &lscr); |
|
|
| printf("input: %s\n", text); |
| printf("cross-entropy: %f bits\n", |
| ch * log(logmath_get_base(lmath)) / log(2)); |
|
|
| |
| printf("perplexity: %f\n", logmath_exp(lmath, ch)); |
| printf("lm score: %d\n", lscr); |
|
|
| |
| printf("%d words evaluated\n", n); |
| printf("%d OOVs, %d context cues removed\n", |
| noovs, nccs); |
|
|
| ckd_free(textfoo); |
| ckd_free(words); |
| } |
|
|
| int |
| main(int argc, char *argv[]) |
| { |
| cmd_ln_t *config; |
| ngram_model_t *lm = NULL; |
| logmath_t *lmath; |
| const char *lmfn, *probdefn, *lsnfn, *text; |
|
|
| if ((config = cmd_ln_parse_r(NULL, defn, argc, argv, TRUE)) == NULL) { |
| |
| err_set_loglevel(ERR_INFO); |
| cmd_ln_log_help_r(NULL, defn); |
| return 1; |
| } |
|
|
| verbose = ps_config_bool(config, "verbose"); |
|
|
| |
| if ((lmath = logmath_init |
| (ps_config_float(config, "logbase"), 0, 0)) == NULL) { |
| E_FATAL("Failed to initialize log math\n"); |
| } |
|
|
| |
| lmfn = ps_config_str(config, "lm"); |
| if (lmfn == NULL |
| || (lm = ngram_model_read(config, lmfn, |
| NGRAM_AUTO, lmath)) == NULL) { |
| E_FATAL("Failed to load language model from %s\n", |
| ps_config_str(config, "lm")); |
| } |
| if ((probdefn = ps_config_str(config, "probdef")) != NULL) |
| ngram_model_read_classdef(lm, probdefn); |
| ngram_model_apply_weights(lm, |
| ps_config_float(config, "lw"), |
| ps_config_float(config, "wip")); |
|
|
| |
| lsnfn = ps_config_str(config, "lsn"); |
| text = ps_config_str(config, "text"); |
| if (lsnfn) { |
| evaluate_file(lm, lmath, lsnfn); |
| } |
| else if (text) { |
| evaluate_string(lm, lmath, text); |
| } |
|
|
| return 0; |
| } |
|
|