|
|
|
|
|
|
|
|
|
|
|
import argparse |
|
import os |
|
import subprocess |
|
import sys |
|
|
|
|
|
# Index of the feature field in a '|||'-delimited N-best line
FEAT_FIELD = 2

# Default Moses bin directory: the 'bin' folder two levels above the directory
# that holds this script.  The script path is made absolute first so that the
# default still resolves correctly when the script is launched through a
# relative path (where __file__ may be just 'train.py' and repeated dirname()
# calls would otherwise collapse to the current working directory).
BIN_DIR = os.path.join(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))),
    'bin')
|
|
|
def _initial_weights(feats):
    """Build the zero-initialized dense weight layout from feature tokens.

    ``feats`` is the whitespace-split feature field of one N-best entry.
    Tokens ending in '=' are feature names; every other token is a value that
    belongs to the most recent name.  Values that appear before the first
    name are ignored.

    Returns a list of ``[name, [0, ...]]`` pairs with one zero per value.
    """
    layout = []
    for token in feats:
        if token.endswith('='):
            layout.append([token, []])
        elif layout:
            layout[-1][1].append(0)
    return layout


def _run_or_exit(cmd):
    """Run an external command; abort the script if it exits non-zero."""
    status = subprocess.call(cmd)
    if status != 0:
        sys.stderr.write('Error: command "{}" failed with exit status {}\n'.format(' '.join(cmd), status))
        sys.exit(1)


def main():
    """Learn N-best rescoring weights and write them to ``rescore.ini``.

    Steps, all relative to ``--working-dir``:
      1. Read the first line of the N-best list to discover feature names and
         how many values each one has.
      2. Run ``extractor`` to produce ``scores.dat`` and ``features.dat``.
      3. Write an all-zero ``init.dense`` and run ``kbmira`` to get ``mert.out``.
      4. Read the optimized weights back, divide each by the sum of absolute
         values, and write them under ``[weight]`` in ``rescore.ini``.

    Exits with status 1 if an executable is missing, an external command
    fails, or an input/output file does not have the expected layout.
    """
    parser = argparse.ArgumentParser(description='Learn N-best rescoring weights')
    parser.add_argument('--nbest', metavar='nbest',
                        help='Dev set N-best list augmented with new features', required=True)
    parser.add_argument('--ref', metavar='ref',
                        help='Dev set reference translation', required=True)
    parser.add_argument('--working-dir', metavar='rescore-work',
                        help='Optimizer working directory', required=True)
    parser.add_argument('--bin-dir', metavar='DIR',
                        help='Moses bin dir, containing kbmira, evaluator, etc.', default=BIN_DIR)
    parser.add_argument('--iterations', metavar='N', type=int,
                        help='Number of K-best MIRA iterations to run (default: 300)', default=300)
    args = parser.parse_args()

    # Locate the required executables before doing any work
    extractor = os.path.join(args.bin_dir, 'extractor')
    kbmira = os.path.join(args.bin_dir, 'kbmira')
    for exe in (extractor, kbmira):
        if not os.path.exists(exe):
            sys.stderr.write('Error: cannot find executable "{}" in "{}", please specify --bin-dir\n'.format(exe, args.bin_dir))
            sys.exit(1)

    # makedirs (not mkdir) so a nested working directory can be created
    if not os.path.exists(args.working_dir):
        os.makedirs(args.working_dir)

    scores_file = os.path.join(args.working_dir, 'scores.dat')
    features_file = os.path.join(args.working_dir, 'features.dat')
    init_file = os.path.join(args.working_dir, 'init.dense')
    mert_file = os.path.join(args.working_dir, 'mert.out')
    ini_file = os.path.join(args.working_dir, 'rescore.ini')

    # Feature names and value counts come from the first N-best entry only
    with open(args.nbest) as nbest:
        fields = [f.strip() for f in nbest.readline().split('|||')]
    if len(fields) <= FEAT_FIELD:
        sys.stderr.write('Error: cannot read feature field from first line of "{}"\n'.format(args.nbest))
        sys.exit(1)
    init_weights = _initial_weights(fields[FEAT_FIELD].split())

    # Extract score and feature data
    _run_or_exit([extractor,
                  '--sctype', 'BLEU', '--scconfig', 'case:true',
                  '--scfile', scores_file,
                  '--ffile', features_file,
                  '-r', args.ref,
                  '-n', args.nbest])

    # Write dense initial weights: one "name value" line per weight
    with open(init_file, 'w') as out:
        for (feat, weights) in init_weights:
            for w in weights:
                out.write('{} {}\n'.format(feat, w))

    # Run K-best MIRA
    _run_or_exit([kbmira,
                  '--dense-init', init_file,
                  '--ffile', features_file,
                  '--scfile', scores_file,
                  '-o', mert_file,
                  '--iters', str(args.iterations)])

    # Read optimized weights; lines are consumed in the same order in which
    # init.dense was written, and the weight is the second token of each line
    opt_weights = []
    total = 0
    with open(mert_file) as inp:
        for (feat, weights) in init_weights:
            opt_weights.append([feat, []])
            for _ in weights:
                tokens = inp.readline().split()
                if len(tokens) < 2:
                    sys.stderr.write('Error: "{}" has fewer weights than expected\n'.format(mert_file))
                    sys.exit(1)
                w = float(tokens[1])
                opt_weights[-1][1].append(w)
                total += abs(w)

    # Normalize so the absolute values sum to 1; skip when the sum is zero
    # to avoid dividing by zero
    if total:
        for (_, weights) in opt_weights:
            for i in range(len(weights)):
                weights[i] /= total
    else:
        sys.stderr.write('Warning: optimized weights sum to zero, writing them unnormalized\n')

    # Write the rescorer configuration
    with open(ini_file, 'w') as out:
        out.write('# For use with Moses N-best rescorer "scripts/nbest-rescore/rescore.py"\n')
        out.write('\n')
        out.write('[weight]\n')
        for (feat, weights) in opt_weights:
            out.write('{} {}\n'.format(feat, ' '.join(str(w) for w in weights)))
|
|
|
# Run the trainer only when this file is executed as a script, not on import
if __name__ == "__main__":
    main()
|
|