| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | from argparse import ArgumentParser |
| | import math |
| | import os |
| | from random import randint |
| | import sys, gzip |
| |
|
| |
|
def count_ngrams(snt, max_n):
    """
    Return a dictionary mapping each ngram of /snt/ (as a tuple of
    words) of length 1..max_n to its occurrence count.

    :param snt: sentence as a list of word tokens
    :param max_n: maximum ngram length to count
    """
    ret = {}
    # range (not the Python-2-only xrange) keeps this runnable on
    # both Python 2 and Python 3 with identical behavior.
    for i in range(len(snt)):
        # End index k runs from i+1 (unigram) up to i+max_n,
        # clipped at the end of the sentence.
        for k in range(i + 1, min(i + max_n + 1, len(snt) + 1)):
            key = tuple(snt[i:k])
            ret[key] = ret.get(key, 0) + 1
    return ret
| |
|
| |
|
def max_counts(ng1, ng2):
    """
    Combine two ngram count dictionaries /ng1/ and /ng2/ into a new
    dictionary in which every ngram maps to the larger of its two
    counts (an ngram missing from one side counts as zero there).
    """
    merged = ng1.copy()
    for ngram, count in ng2.items():
        if count > merged.get(ngram, 0):
            merged[ngram] = count
    return merged
| |
|
| |
|
def ng_hits(hyp, ref, max_n):
    """
    Return a list of length /max_n/ whose entry n-1 is the clipped
    ngram match count for length n: for each ngram in /hyp/, the
    minimum of its hypothesis count and its reference count.

    :param hyp: ngram count dictionary for the hypothesis sentence
    :param ref: ngram count dictionary for the reference sentence
    :param max_n: maximum ngram length considered
    """
    ret = [0] * max_n
    for ng, cnt in hyp.items():
        if len(ng) <= max_n:
            # Clip the hypothesis count at the reference count.
            ret[len(ng) - 1] += min(cnt, ref.get(ng, 0))
    return ret
| |
|
| |
|
class BleuScore:
    """
    BLEU score of a hypothesis document against a reference document,
    with optional bootstrap resampling over sentences to estimate an
    empirical confidence interval.
    """

    def __init__(self, hyp, ref, max_n=4, bootstrap=1000):
        """
        :param hyp: hypothesis document (provides .snt and .ngrams)
        :param ref: reference document (provides .snt and .ngrams)
        :param max_n: maximum ngram length entering the score
        :param bootstrap: number of bootstrap resamples (0 disables)
        """
        # Per-sentence clipped ngram hit counts against the reference.
        self.hits = [
            ng_hits(hyp.ngrams[i], ref.ngrams[i], max_n)
            for i in range(len(hyp.ngrams))]
        self.max_n = max_n
        self.hyp = hyp
        self.ref = ref
        self.lower = None
        self.upper = None
        self.median = None
        # Score over the full document (identity sample).
        self.actual = self.score(list(range(len(hyp.snt))))
        if bootstrap:
            # Resample sentence indices with replacement and score each
            # sample; sorting lets callers read off percentile bounds.
            self.bootstrap = [
                self.score([randint(0, len(hyp.snt) - 1)
                            for s in hyp.snt])
                for i in range(bootstrap)]
            self.bootstrap.sort()
        else:
            self.bootstrap = [self.actual]

    def score(self, sample):
        """
        Return the BLEU score over /sample/, a list of sentence indices
        (duplicates allowed, as produced by bootstrap resampling).

        Side effects: sets self.hyplen, self.reflen, self.total,
        self.prec and self.BP for the given sample.

        NOTE: precisions are unsmoothed, so math.log raises ValueError
        if any ngram order has zero hits; an empty hypothesis raises
        ZeroDivisionError (unchanged from the original behavior).
        """
        hits = [0] * self.max_n
        self.hyplen = 0
        self.reflen = 0
        self.total = [0] * self.max_n
        for i in sample:
            self.hyplen += len(self.hyp.snt[i])
            self.reflen += len(self.ref.snt[i])
            for n in range(self.max_n):
                hits[n] += self.hits[i][n]
                # Number of ngrams of length n+1 in this sentence.
                self.total[n] += max(len(self.hyp.snt[i]) - n, 0)
        # Modified ngram precisions, geometric mean in log space.
        self.prec = [float(hits[n]) / self.total[n]
                     for n in range(self.max_n)]
        ret = sum([math.log(x) for x in self.prec]) / self.max_n
        # Brevity penalty: <=1, penalizes hypotheses shorter than the
        # reference.
        self.BP = min(
            1, math.exp(1. - float(self.reflen) / float(self.hyplen)))
        ret += math.log(self.BP)
        return math.exp(ret)
| |
|
| |
|
class Document:
    """
    A tokenized document: one sentence per line, tokens separated by
    whitespace, plus a per-sentence ngram count dictionary.
    """

    def __init__(self, fname=None):
        """
        Load a (possibly gzip-compressed, by ".gz" suffix) tokenized
        file, or create an empty shell (for a later merge()) when
        /fname/ is None.
        """
        self.fname = fname
        if fname:
            # Use context managers so file handles are closed
            # deterministically (the original leaked them), and open
            # gzip files in text mode so lines arrive as str.
            if fname.endswith(".gz"):
                with gzip.open(fname, "rt") as f:
                    self.snt = [line.strip().split() for line in f]
            else:
                with open(fname) as f:
                    self.snt = [line.strip().split() for line in f]
            self.ngrams = [count_ngrams(snt, 4) for snt in self.snt]
        else:
            self.snt = None
            self.ngrams = None

    def merge(self, R):
        """
        Turn this document into a multi-reference document over the
        documents in /R/: for each sentence, keep per-ngram the maximum
        count across all references (standard multi-reference BLEU
        clipping). Sentences are initialized from the first reference;
        update() later substitutes closest-length reference sentences.
        """
        self.fname = "multi-ref"
        self.ngrams = list(R[0].ngrams)
        self.snt = list(R[0].snt)
        for i in range(len(R[0].ngrams)):
            for k in range(1, len(R)):
                self.ngrams[i] = max_counts(self.ngrams[i], R[k].ngrams[i])

    def update(self, hyp, R):
        """
        For each sentence i, set self.snt[i] to the sentence from /R/
        whose length is closest to the hypothesis sentence length, ties
        broken in favor of the shorter reference — so the brevity
        penalty uses the closest reference length.
        """
        for i, hyp_snt in enumerate(hyp.snt):
            clen = len(hyp_snt)
            K = 0  # index of the best reference so far
            for k in range(1, len(R)):
                k_snt = R[k].snt[i]
                assert len(R[k].snt) == len(hyp.snt), (
                    "Mismatch in number of sentences " +
                    "between reference and candidate")
                if abs(len(k_snt) - clen) == abs(len(R[K].snt[i]) - clen):
                    # Equal distance: prefer the shorter reference.
                    if len(k_snt) < len(R[K].snt[i]):
                        K = k
                elif abs(len(k_snt) - clen) < abs(len(R[K].snt[i]) - clen):
                    K = k
            self.snt[i] = R[K].snt[i]
| |
|
| |
|
if __name__ == "__main__":
    # Command-line driver: score each candidate file against the merged
    # multi-reference document and report BLEU with a bootstrap
    # confidence interval.
    argparser = ArgumentParser()
    argparser.add_argument(
        "-r", "--ref", nargs='+', help="Reference translation(s).")
    argparser.add_argument(
        "-c", "--cand", nargs='+', help="Candidate translations.")
    argparser.add_argument(
        "-i", "--individual", action='store_true',
        help="Compute BLEU scores for individual references.")
    argparser.add_argument(
        "-b", "--bootstrap", type=int, default=1000,
        help="Sample size for bootstrap resampling.")
    argparser.add_argument(
        "-a", "--alpha", type=float, default=.05,
        help="1-alpha = confidence interval.")
    args = argparser.parse_args(sys.argv[1:])
    R = [Document(fname) for fname in args.ref]
    C = [Document(fname) for fname in args.cand]
    # Merged multi-reference document with per-ngram maximum counts.
    Rx = Document()
    Rx.merge(R)
    for c in C:
        # Pick, per sentence, the reference whose length is closest to
        # this candidate's (affects only the brevity penalty).
        Rx.update(c, R)
        bleu = BleuScore(c, Rx, bootstrap=args.bootstrap)
        # print() call: the original used the Python-2 print statement,
        # which is a syntax error under Python 3.
        print("%5.2f %s [%5.2f-%5.2f; %5.2f] %s" % (
            100 * bleu.actual,
            os.path.basename(Rx.fname),
            100 * bleu.bootstrap[int((args.alpha / 2) * args.bootstrap)],
            100 * bleu.bootstrap[int((1 - (args.alpha / 2)) * args.bootstrap)],
            100 * bleu.bootstrap[int(.5 * args.bootstrap)],
            c.fname))

        if args.individual:
            # Additionally score the candidate against each single
            # reference on its own.
            for r in R:
                bleu = BleuScore(c, r, bootstrap=args.bootstrap)
                print("    %5.2f %s" % (
                    100 * bleu.actual, os.path.basename(r.fname)))
| | |
| |
|
| | |
| | |
| | |
| |
|