|
|
|
|
|
|
|
from __future__ import unicode_literals |
|
import unittest |
|
import codecs |
|
|
|
import os,sys,inspect |
|
# Resolve the directory that holds this test module, then put its parent
# first on sys.path so the sibling-module imports that follow can be resolved
# without installing anything (the parent is expected to contain
# learn_bpe.py and apply_bpe.py).
currentdir = os.path.dirname(
    os.path.abspath(inspect.getfile(inspect.currentframe()))
)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)
|
|
|
from learn_bpe import learn_bpe |
|
from apply_bpe import BPE |
|
|
|
|
|
class TestBPELearnMethod(unittest.TestCase):
    """Regression test for ``learn_bpe`` against a stored reference file."""

    def test_learn_bpe(self):
        """Learn codes from ``data/corpus.en`` and compare with ``data/bpe.ref``.

        ``learn_bpe`` is called with the corpus, an output file
        (``data/bpe.out``) and ``1000`` as its third argument (presumably the
        number of merge operations -- confirm against ``learn_bpe``).  The
        written file must match the reference line for line, including the
        total number of lines.
        """
        data_dir = os.path.join(currentdir, 'data')
        out_path = os.path.join(data_dir, 'bpe.out')

        # Context managers guarantee the handles are released even when
        # learn_bpe raises.
        with codecs.open(os.path.join(data_dir, 'corpus.en'), encoding='utf-8') as infile:
            with codecs.open(out_path, 'w', encoding='utf-8') as outfile:
                learn_bpe(infile, outfile, 1000)

        # Read both files fully before asserting so a failing assertion
        # cannot leave a file open.
        with open(out_path) as outlines:
            produced = list(outlines)
        with open(os.path.join(data_dir, 'bpe.ref')) as reflines:
            expected = list(reflines)

        for line, line2 in zip(produced, expected):
            self.assertEqual(line, line2)

        # zip() stops at the shorter sequence, so missing or extra lines in
        # the output would otherwise go unnoticed.
        self.assertEqual(len(produced), len(expected))
|
|
|
class TestBPESegmentMethod(unittest.TestCase):
    """Tests for ``BPE.process_line`` using the codes in ``data/bpe.ref``."""

    def setUp(self):
        """Build the ``BPE`` object and open the corpus and reference files."""
        with codecs.open(os.path.join(currentdir, 'data', 'bpe.ref'), encoding='utf-8') as bpefile:
            self.bpe = BPE(bpefile)

        # Register each close immediately after the open: cleanups run even
        # if a later statement in setUp raises, whereas tearDown would be
        # skipped in that case and the first handle would leak.
        self.infile = codecs.open(os.path.join(currentdir, 'data', 'corpus.en'), encoding='utf-8')
        self.addCleanup(self.infile.close)

        self.reffile = codecs.open(os.path.join(currentdir, 'data', 'corpus.bpe.ref.en'), encoding='utf-8')
        self.addCleanup(self.reffile.close)

    def test_apply_bpe(self):
        """Every corpus line segments to the matching reference line."""
        in_lines = list(self.infile)
        ref_lines = list(self.reffile)

        for line, ref in zip(in_lines, ref_lines):
            out = self.bpe.process_line(line)
            self.assertEqual(out, ref)

        # zip() stops at the shorter sequence; make a line-count mismatch
        # between corpus and reference an explicit failure.
        self.assertEqual(len(in_lines), len(ref_lines))

    def test_trailing_whitespace(self):
        """BPE.process_line() preserves leading and trailing whitespace"""

        orig = ' iron cement \n'
        exp = ' ir@@ on c@@ ement \n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_utf8_whitespace(self):
        """UTF-8 whitespace is treated as normal character, not word boundary"""

        orig = 'iron\xa0cement\n'
        exp = 'ir@@ on@@ \xa0@@ c@@ ement\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)

    def test_empty_line(self):
        """A line consisting only of a newline is returned unchanged."""

        orig = '\n'
        exp = '\n'

        out = self.bpe.process_line(orig)
        self.assertEqual(out, exp)
|
|
|
# Run the test suite when this file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()
|
|