################################################
### CONFIGURATION FILE FOR AN SMT EXPERIMENT ###
################################################

[GENERAL]

### directory in which experiment is run
#
working-dir = /home/pkoehn/experiment

# specification of the language pair
input-extension = fr
output-extension = en
pair-extension = fr-en

### directories that contain tools and data
#
# moses
moses-src-dir = /home/pkoehn/moses
#
# moses binaries
moses-bin-dir = $moses-src-dir/bin
#
# moses scripts
moses-script-dir = $moses-src-dir/scripts
#
# directory where GIZA++/MGIZA programs reside
external-bin-dir = /Users/hieuhoang/workspace/bin/training-tools
#
# srilm
srilm-dir = $moses-src-dir/srilm/bin/i686
#
# irstlm
irstlm-dir = $moses-src-dir/irstlm/bin
#
# randlm
randlm-dir = $moses-src-dir/randlm/bin
#
# data
toy-data = $moses-script-dir/ems/example/data

### basic tools
#
# moses decoder
decoder = $moses-bin-dir/moses

# conversion of rule table into binary on-disk format
ttable-binarizer = "$moses-bin-dir/CreateOnDiskPt 1 1 4 100 2"

# tokenizers - comment out if all your data is already tokenized
input-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $input-extension"
output-tokenizer = "$moses-script-dir/tokenizer/tokenizer.perl -a -l $output-extension"
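
# for illustration only: with input-extension = fr, the input tokenizer
# above amounts to running (a sketch, not the exact EMS invocation):
#   $moses-script-dir/tokenizer/tokenizer.perl -a -l fr < corpus.fr > corpus.tok.fr
# where -l selects the language and -a enables aggressive hyphen splitting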

# For an Arabic tokenizer, try Farasa (download: http:
# Abdelali, Darwish, Durrani, Mubarak (NAACL demo 2016)
# "Farasa: A Fast and Furious Segmenter for Arabic"
#input-tokenizer = "$farasa-dir/farasa_moses.sh"

# truecasers - comment out if you do not use the truecaser
input-truecaser = $moses-script-dir/recaser/truecase.perl
output-truecaser = $moses-script-dir/recaser/truecase.perl
detruecaser = $moses-script-dir/recaser/detruecase.perl

# lowercaser - comment out if you use truecasing
#input-lowercaser = $moses-script-dir/tokenizer/lowercase.perl
#output-lowercaser = $moses-script-dir/tokenizer/lowercase.perl

### generic parallelizer for cluster and multi-core machines
# you may specify a script that allows the parallel execution of
# parallelizable steps (see meta file). you also need to specify
# the number of jobs (cluster) or cores (multi-core)
#
#generic-parallelizer = $moses-script-dir/ems/support/generic-parallelizer.perl
#generic-parallelizer = $moses-script-dir/ems/support/generic-multicore-parallelizer.perl

### cluster settings (if run on a cluster machine)
# number of jobs to be submitted in parallel
#
#jobs = 10

# arguments to qsub when scheduling a job
#qsub-settings = ""

# project for privileges and usage accounting
#qsub-project = iccs_smt

# memory and time
#qsub-memory = 4
#qsub-hours = 48
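
# for illustration only (hypothetical values for a Sun Grid Engine cluster):
# qsub-settings passes extra arguments straight to qsub, e.g.
#qsub-settings = "-l mem_free=4G -q batch.q"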

### multi-core settings
# when the generic parallelizer is used, the number of cores
# specified here is used
cores = 4

#################################################################
# PARALLEL CORPUS PREPARATION:
# create a tokenized, sentence-aligned corpus, ready for training

[CORPUS]

### long sentences are filtered out, since they slow down GIZA++
# and are a less reliable source of data. set here the maximum
# length of a sentence
#
max-sentence-length = 80

[CORPUS:toy]

### command to run to get raw corpus files
#
# get-corpus-script =

### raw corpus files (untokenized, but sentence aligned)
#
raw-stem = $toy-data/nc-5k

### tokenized corpus files (may contain long sentences)
#
#tokenized-stem =

### if sentence filtering should be skipped,
# point to the clean training data
#
#clean-stem =

### if corpus preparation should be skipped,
# point to the prepared training data
#
#lowercased-stem =

#################################################################
# LANGUAGE MODEL TRAINING

[LM]

### tool to be used for language model training
# kenlm training
lm-training = "$moses-script-dir/ems/support/lmplz-wrapper.perl -bin $moses-bin-dir/lmplz"
settings = "--prune '0 0 1' -T $working-dir/lm -S 20%"
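#
# for illustration: with order = 5 (set below), the wrapper roughly runs
#   lmplz -o 5 --prune '0 0 1' -T $working-dir/lm -S 20% < corpus > lm.arpa
# (a sketch; the exact command line is assembled by lmplz-wrapper.perl)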

# srilm
#lm-training = $srilm-dir/ngram-count
#settings = "-interpolate -kndiscount -unk"

# irstlm training
# msb = modified shift-beta smoothing (approximates modified Kneser-Ney);
# p=0 disables singleton pruning
#lm-training = "$moses-script-dir/generic/trainlm-irst2.perl -cores $cores -irst-dir $irstlm-dir -temp-dir $working-dir/tmp"
#settings = "-s msb -p 0"

# order of the language model
order = 5

### tool to be used for training a randomized language model from scratch
# (more commonly, an SRILM model is trained)
#
#rlm-training = "$randlm-dir/buildlm -falsepos 8 -values 8"

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8
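#
# (presumably, "type = 8" selects the KenLM implementation code used in
# the language model line of the generated moses.ini)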

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

### each language model to be used has its own section here

[LM:toy]

### command to run to get raw corpus files
#
#get-corpus-script = ""

### raw corpus (untokenized)
#
raw-corpus = $toy-data/nc-5k.$output-extension

### tokenized corpus files (may contain long sentences)
#
#tokenized-corpus =

### if corpus preparation should be skipped,
# point to the prepared language model
#
#lm =

#################################################################
# INTERPOLATING LANGUAGE MODELS

[INTERPOLATED-LM]

# if multiple language models are used, these may be combined
# by optimizing perplexity on a tuning set
# see, for instance [Koehn and Schwenk, IJCNLP 2008]

### script to interpolate language models
# if commented out, no interpolation is performed
#
# script = $moses-script-dir/ems/support/interpolate-lm.perl

### tuning set
# you may use the same set that is used for mert tuning (reference set)
#
#tuning-sgm =
#raw-tuning =
#tokenized-tuning =
#factored-tuning =
#lowercased-tuning =
#split-tuning =

### group language models for hierarchical interpolation
# (flat interpolation is limited to 10 language models)
#group = "first,second fourth,fifth"
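#
# for illustration (hypothetical section names): with five LMs first..fifth,
# the group setting above would first interpolate first+second and
# fourth+fifth within their groups, then interpolate the two group models
# together with the ungrouped LM third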

### script to use for binary table format for irstlm or kenlm
# (default: no binarization)

# irstlm
#lm-binarizer = $irstlm-dir/compile-lm

# kenlm, also set type to 8
lm-binarizer = $moses-bin-dir/build_binary
type = 8

### script to create quantized language model format (irstlm)
# (default: no quantization)
#
#lm-quantizer = $irstlm-dir/quantize-lm

### script to use for converting into randomized table format
# (default: no randomization)
#
#lm-randomizer = "$randlm-dir/buildlm -falsepos 8 -values 8"

#################################################################
# MODIFIED MOORE LEWIS FILTERING

[MML] IGNORE

### specifications for language models to be trained
#
#lm-training = $srilm-dir/ngram-count
#lm-settings = "-interpolate -kndiscount -unk"
#lm-binarizer = $moses-src-dir/bin/build_binary
#lm-query = $moses-src-dir/bin/query
#order = 5

### in-/out-of-domain source/target corpora to train the 4 language models
#
# in-domain: point either to a parallel corpus
#indomain-stem = [CORPUS:toy:clean-split-stem]

# ... or to two separate monolingual corpora
#indomain-target = [LM:toy:lowercased-corpus]
#raw-indomain-source = $toy-data/nc-5k.$input-extension

# point to out-of-domain parallel corpus
#outdomain-stem = [CORPUS:giga:clean-split-stem]

# settings: number of lines sampled from the corpora to train each language model on
# (if used at all, should be small as a percentage of the corpus)
#settings = "--line-count 100000"

#################################################################
# TRANSLATION MODEL TRAINING

[TRAINING]

### training script to be used: either a legacy script or
# the current moses training script (default)
#
script = $moses-script-dir/training/train-model.perl

### general options
# these are options that are passed on to train-model.perl, for instance
# * "-mgiza -mgiza-cpus 8" to use mgiza instead of giza
# * "-sort-buffer-size 8G -sort-compress gzip" to reduce on-disk sorting
# * "-sort-parallel 8 -cores 8" to speed up phrase table building
# * "-parallel" for parallel execution of mkcls and giza
#
#training-options = ""

### factored training: specify here which factors are used
# if none are specified, single-factor training is assumed
# (one translation step, surface to surface)
#
#input-factors = word lemma pos morph
#output-factors = word lemma pos
#alignment-factors = "word -> word"
#translation-factors = "word -> word"
#reordering-factors = "word -> word"
#generation-factors = "word -> pos"
#decoding-steps = "t0, g0"
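#
# a sketch of a joint two-factor setup (factor names are illustrative,
# following the factored-training conventions above):
#input-factors = word pos
#output-factors = word pos
#alignment-factors = "word -> word"
#translation-factors = "word+pos -> word+pos"
#reordering-factors = "word -> word"
#decoding-steps = "t0"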

### parallelization of data preparation step
# the two directions of the data preparation can be run in parallel
# comment out if not needed
#
parallel = yes

### pre-computation for giza++
# giza++ has a more efficient data structure that needs to be
# initialized with snt2cooc. if run in parallel, this may reduce
# memory requirements. set here the number of parts
#
#run-giza-in-parts = 5

### symmetrization method to obtain word alignments from giza output
# (commonly used: grow-diag-final-and)
#
alignment-symmetrization-method = grow-diag-final-and

### use of Chris Dyer's fast align for word alignment
#
#fast-align-settings = "-d -o -v"

### use of berkeley aligner for word alignment
#
#use-berkeley = true
#alignment-symmetrization-method = berkeley
#berkeley-train = $moses-script-dir/ems/support/berkeley-train.sh
#berkeley-process = $moses-script-dir/ems/support/berkeley-process.sh
#berkeley-jar = /your/path/to/berkeleyaligner-1.1/berkeleyaligner.jar
#berkeley-java-options = "-server -mx30000m -ea"
#berkeley-training-options = "-Main.iters 5 5 -EMWordAligner.numThreads 8"
#berkeley-process-options = "-EMWordAligner.numThreads 8"
#berkeley-posterior = 0.5

### use of baseline alignment model (incremental training)
#
#baseline = 68
#baseline-alignment-model = "$working-dir/training/prepared.$baseline/$input-extension.vcb \
#  $working-dir/training/prepared.$baseline/$output-extension.vcb \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.cooc \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.cooc \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.thmm.5 \
#  $working-dir/training/giza.$baseline/${output-extension}-$input-extension.hhmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.thmm.5 \
#  $working-dir/training/giza-inverse.$baseline/${input-extension}-$output-extension.hhmm.5"

### if word alignment should be skipped,
# point to word alignment files
#
#word-alignment = $working-dir/model/aligned.1

### filtering some corpora with modified Moore-Lewis
# specify corpora to be filtered and the ratio to be kept, either before or after word alignment
#mml-filter-corpora = toy
#mml-before-wa = "-proportion 0.9"
#mml-after-wa = "-proportion 0.9"
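#
# (with "-proportion 0.9", presumably the best-scoring 90% of sentence
# pairs under the modified Moore-Lewis criterion are kept)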

### build memory mapped suffix array phrase table
# (binarizing the reordering table is a good idea, since filtering makes little sense)
#mmsapt = "num-features=9 pfwd=g+ pbwd=g+ smooth=0 sample=1000 workers=1"
#binarize-all = $moses-script-dir/training/binarize-model.perl

### create a bilingual concordancer for the model
#
#biconcor = $moses-bin-dir/biconcor

## Operation Sequence Model (OSM)
# Durrani, Schmid and Fraser (2011):
# "A Joint Sequence Translation Model with Integrated Reordering"
# compile Moses with --max-kenlm-order=9 if a higher order is required
#
#operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "-lmplz '$moses-src-dir/bin/lmplz -S 40%'"
#
# OR if you want to use it with SRILM
#
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64"

## Class-based Operation Sequence Model (OSM)
# to enable OSM with factors, add the factors as below.
# Durrani, Koehn, Schmid, Fraser (COLING, 2014):
# "Investigating the Usefulness of Generalized Word Representations in SMT"
#
#operation-sequence-model-settings = "--factor 0-0+1-1"

## Interpolated Operation Sequence Model (OSM)
# interpolates domain-specific OSM models, tuned on a development set.
# Durrani, Sajjad, Joty, Abdelali and Vogel (MT Summit, 2015):
# "Using Joint Models for Domain Adaptation in Statistical Machine Translation"
#
#interpolated-operation-sequence-model = "yes"
#operation-sequence-model-order = 5
#operation-sequence-model-settings = "--srilm-dir /path-to-srilm/bin/i686-m64 --tune /path-to-tune-folder/tune_file"
#
# interpolated OSM can only be used with SRILM because of the interpolation script

# if OSM training should be skipped, point to the OSM model
#osm-model =

### unsupervised transliteration module
# Durrani, Sajjad, Hoang and Koehn (EACL, 2014):
# "Integrating an Unsupervised Transliteration Model
# into Statistical Machine Translation"
#
#transliteration-module = "yes"
#post-decoding-transliteration = "yes"

### lexicalized reordering: specify orientation type
# (default: only distance-based reordering model)
#
lexicalized-reordering = msd-bidirectional-fe

### hierarchical rule set
#
#hierarchical-rule-set = true

### settings for rule extraction
#
#extract-settings = ""
max-phrase-length = 5

### add extracted phrases from baseline model
#
#baseline-extract = $working-dir/model/extract.$baseline
#
# requires aligned parallel corpus for re-estimating lexical translation probabilities
#baseline-corpus = $working-dir/training/corpus.$baseline
#baseline-alignment = $working-dir/model/aligned.$baseline.$alignment-symmetrization-method

### unknown word labels (target syntax only)
# enables use of unknown word labels during decoding
# label file is generated during rule extraction
#
#use-unknown-word-labels = true

### if phrase extraction should be skipped,
# point to stem for extract files
#
# extracted-phrases =

### settings for rule scoring
#
score-settings = "--GoodTuring --MinScore 2:0.0001"
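#
# (--GoodTuring applies Good-Turing discounting to phrase counts;
# --MinScore 2:0.0001 presumably prunes entries whose score field 2,
# the direct phrase probability p(e|f), falls below 0.0001)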

### include word alignment in phrase table
#
#include-word-alignment-in-rules = yes

### sparse lexical features
#
#sparse-features = "target-word-insertion top 50, source-word-deletion top 50, word-translation top 50 50, phrase-length"
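#
# ("top 50" presumably restricts a feature type to the 50 most frequent
# words, keeping the number of sparse features manageable)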

### domain adaptation settings
# options: sparse, any of: indicator, subset, ratio
#domain-features = "subset"

### if phrase table training should be skipped,
# point to phrase translation table
#
# phrase-translation-table =

### if reordering table training should be skipped,
# point to reordering table
#
# reordering-table =

### filtering the phrase table based on significance tests
# Johnson, Martin, Foster and Kuhn (2007): "Improving Translation Quality by Discarding Most of the Phrasetable"
# options: -n max number of translations per source phrase;
#          -l threshold: 'a+e', 'a-e', or a positive real value (a -log prob threshold)
#salm-index = /path/to/project/salm/Bin/Linux/Index/IndexSA.O64
#sigtest-filter = "-l a+e -n 50"
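#
# (in the paper's notation, 'a' is the significance level of phrase pairs
# that occur exactly once; 'a+e' filters such 1-1-1 pairs out, 'a-e' keeps
# them; -n 50 keeps at most 50 translations per source phrase)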

### if training should be skipped,
# point to a configuration file that contains
# pointers to all relevant model files
#
#config-with-reused-weights =

#####################################################
### TUNING: finding good weights for model components

[TUNING]

### instead of tuning with this setting, old weights may be recycled
# specify here an old configuration file with matching weights
#
weight-config = $toy-data/weight.ini

### tuning script to be used
#
tuning-script = $moses-script-dir/training/mert-moses.pl
tuning-settings = "-mertdir $moses-bin-dir"

### specify the corpus used for tuning
# it should contain thousands of sentences
#
#input-sgm =
#raw-input =
#tokenized-input =
#factorized-input =
#input =
#
#reference-sgm =
#raw-reference =
#tokenized-reference =
#factorized-reference =
#reference =

### size of n-best list used (typically 100)
#
nbest = 100

### ranges for weights for random initialization
# if not specified, the tuning script will use generic ranges
# it is not clear if this matters
#
# lambda =

### additional flags for the filter script
#
filter-settings = ""

### additional flags for the decoder
#
decoder-settings = "-threads $cores"

### if tuning should be skipped, specify this here
# and also point to a configuration file that contains
# pointers to all relevant model files
#
#config-with-reused-weights =

#########################################################
## RECASER: restore case, this part only trains the model

[RECASING] IGNORE

### training data
# raw input still needs to be tokenized;
# alternatively, already tokenized input may be specified
#
#tokenized = [LM:europarl:tokenized-corpus]

### additional settings
#
recasing-settings = ""
#lm-training = $srilm-dir/ngram-count
decoder = $moses-bin-dir/moses

# already have a trained recaser? point to its config file
#recase-config =

#######################################################
## TRUECASER: train model to truecase corpora and input

[TRUECASER]

### script to train truecaser models
#
trainer = $moses-script-dir/recaser/train-truecaser.perl

### training data
# data on which truecaser is trained
# if no training data is specified, parallel corpus is used
#
# raw-stem =
# tokenized-stem =

### trained model
#
# truecase-model =

######################################################################
## EVALUATION: translating a test set with the tuned system and scoring it

[EVALUATION]

### additional flags for the filter script
#
#filter-settings = ""

### additional decoder settings
# switches for the Moses decoder
# common choices:
# "-threads N" for multi-threading
# "-mbr" for MBR decoding
# "-drop-unknown" for dropping unknown source words
# "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000" for cube pruning
#
decoder-settings = "-search-algorithm 1 -cube-pruning-pop-limit 5000 -s 5000 -threads $cores"
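#
# (cube pruning trades search accuracy for speed: the pop limit caps the
# number of hypotheses popped per stack and -s caps the stack size;
# larger values search more thoroughly but decode more slowly)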

### specify size of n-best list, if produced
#
#nbest = 100

### multiple reference translations
#
#multiref = yes

### prepare system output for scoring
# this may include detokenization and wrapping output in sgm
# (needed for nist-bleu, ter, meteor)
#
detokenizer = "$moses-script-dir/tokenizer/detokenizer.perl -l $output-extension"
#recaser = $moses-script-dir/recaser/recase.perl
wrapping-script = "$moses-script-dir/ems/support/wrap-xml.perl $output-extension"
#output-sgm =

### BLEU
#
nist-bleu = $moses-script-dir/generic/mteval-v13a.pl
nist-bleu-c = "$moses-script-dir/generic/mteval-v13a.pl -c"
#multi-bleu = "$moses-script-dir/generic/multi-bleu.perl -lc"
#multi-bleu-c = $moses-script-dir/generic/multi-bleu.perl
#ibm-bleu =
#sacre-bleu = "sacrebleu -lc"
#sacre-bleu-c = "sacrebleu"
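#
# (the -c / -lc variants control casing: mteval's -c scores
# case-sensitively, while the -lc flags of multi-bleu and sacrebleu
# lowercase before scoring)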

### TER: translation error rate (BBN metric) based on edit distance
# not yet integrated
#
# ter =

### METEOR: gives credit to stem / WordNet synonym matches
# not yet integrated
#
# meteor =

### Analysis: carry out various forms of analysis on the output
#
analysis = $moses-script-dir/ems/support/analysis.perl
#
# also report on input coverage
analyze-coverage = yes
#
# also report on phrase mappings used
report-segmentation = yes
#
# report precision of translations for each input word, broken down by
# count of input word in corpus and model
#report-precision-by-coverage = yes
#
# further precision breakdown by factor
#precision-by-coverage-factor = pos
#
# visualization of the search graph in tree-based models
#analyze-search-graph = yes

[EVALUATION:test]

### input data
#
input-sgm = $toy-data/test-src.$input-extension.sgm
# raw-input =
# tokenized-input =
# factorized-input =
# input =

### reference data
#
reference-sgm = $toy-data/test-ref.$output-extension.sgm
# raw-reference =
# tokenized-reference =
# reference =

### analysis settings
# may contain any of the general evaluation analysis settings
# specific setting: base coverage statistics on earlier run
#
#precision-by-coverage-base = $working-dir/evaluation/test.analysis.5

### wrapping frame
# for nist-bleu and other scoring scripts, the output needs to be wrapped
# in sgm markup (typically like the input sgm)
#
wrapping-frame = $input-sgm

##########################################
### REPORTING: summarize evaluation scores

[REPORTING]

### currently no parameters for reporting section