sakharamg's picture
Uploading all files
158b61b
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Rico Sennrich <sennrich [AT] cl.uzh.ch>
#
# This file is part of moses. Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.
# This script creates tables that store phrase pair frequencies rather than
# probabilities.
#
# These count tables can be used for a delayed, online computation of the
# original phrase translation features.
#
# The benefit is that models can be combined quickly, with the same results
# as if we trained a model on the concatenation of all data (excepting
# differences in word alignment).
#
# Also, each model can be given a weight, which is applied to all frequencies
# of the model for the combination.
# Note: the input phrase table must have alignment information;
# it must be unsmoothed;
# additionally, the phrase table type PhraseDictionaryMultiModelCounts
# requires the lexical counts files lex.counts.e2f and lex.counts.f2e
# (obtained by using the option --write-lexical-counts in
# train-model.perl)
# The results may differ from training on the concatenation of all data due
# to differences in word alignment, and rounding errors.
from __future__ import unicode_literals
import sys
import os
import gzip
from tempfile import NamedTemporaryFile
from subprocess import Popen, PIPE
if len(sys.argv) < 3 or len(sys.argv) > 4:
sys.stderr.write(
'Usage: ' +
sys.argv[0] + " in_file out_path [prune_count]\n"
"This script will create the files out_path/count-table.gz and "
"out_path/count-table-target.gz\n")
exit()
def handle_file(filename, action, fileobj=None, mode='r'):
"""support reading either from stdin, plain file or gzipped file"""
if action == 'open':
if mode == 'r':
mode = 'rb'
if mode == 'rb' and filename != '-' and not os.path.exists(filename):
if os.path.exists(filename + '.gz'):
filename = filename + '.gz'
else:
sys.stderr.write(
"Error: unable to open file. " +
filename + " - aborting.\n")
exit()
if filename.endswith('.gz'):
fileobj = gzip.open(filename, mode)
elif filename == '-':
fileobj = sys.stdin
else:
fileobj = open(filename, mode)
return fileobj
elif action == 'close' and filename != '-':
fileobj.close()
def sort_and_uniq(infile, outfile):
cmd = ['sort', infile]
fobj = handle_file(outfile, 'open', mode='w')
sys.stderr.write(
"Executing: LC_ALL=C " +
' '.join(cmd) +
' | uniq | gzip -c > ' + outfile + '\n')
p_sort = Popen(cmd, env={'LC_ALL': 'C'}, stdout=PIPE)
p_uniq = Popen(['uniq'], stdin=p_sort.stdout, stdout=PIPE)
p_compress = Popen(['gzip', '-c'], stdin=p_uniq.stdout, stdout=fobj)
p_compress.wait()
fobj.close()
def create_count_lines(fobj, countobj, countobj_target, prune=0):
i = 0
original_pos = 0
source = b""
store_lines = set()
for line in fobj:
if not i % 100000:
sys.stderr.write('.')
i += 1
line = line.split(b' ||| ')
current_source = line[0]
scores = line[2].split()
comments = line[4].split()
fs = comments[1]
ft = comments[0]
try:
fst = comments[2]
except IndexError:
fst = str(int(round(float(scores[0]) * float(ft)))).encode()
line[2] = b' '.join([fst, ft, fs])
if prune:
if current_source != source:
write_batch(store_lines, countobj, prune)
source = current_source
store_lines = set()
original_pos = 0
store_lines.add((float(fst), original_pos, b' ||| '.join(line)))
original_pos += 1
else:
countobj.write(b' ||| '.join(line))
# Target count file.
# If you use string formatting to make this look nicer, you may break
# Python 3 compatibility.
tline = b' ||| '.join([line[1], b'X', ft]) + b' ||| |||\n'
countobj_target.write(tline)
if prune:
write_batch(store_lines, countobj, prune)
countobj.close()
countobj_target.close()
def write_batch(store_lines, outfile, prune):
top20 = sorted(store_lines, reverse=True)[:prune]
# Write in original_order.
for score, original_pos, store_line in sorted(top20, key=lambda x: x[1]):
outfile.write(store_line)
if __name__ == '__main__':
if len(sys.argv) == 4:
prune = int(sys.argv[3])
else:
prune = 0
fileobj = handle_file(sys.argv[1], 'open')
out_path = sys.argv[2]
count_table_file = gzip.open(
os.path.join(out_path, 'count-table.gz'), 'w')
count_table_target_file = os.path.join(out_path, 'count-table-target.gz')
count_table_target_file_temp = NamedTemporaryFile(delete=False)
try:
sys.stderr.write(
"Creating temporary file for unsorted target counts file: " +
count_table_target_file_temp.name + '\n')
create_count_lines(
fileobj, count_table_file, count_table_target_file_temp, prune)
count_table_target_file_temp.close()
sys.stderr.write(
"Finished writing, "
"now re-sorting and compressing target count file.\n")
sort_and_uniq(
count_table_target_file_temp. name, count_table_target_file)
os.remove(count_table_target_file_temp.name)
sys.stderr.write('Done\n')
except BaseException:
os.remove(count_table_target_file_temp.name)
raise