File size: 4,174 Bytes
e50fe35
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import unittest
import mock

import os,sys,inspect
# Put the parent of this file's directory at the front of the import path so
# that the `apply_bpe` import below can be resolved.
currentdir = os.path.abspath(inspect.getfile(inspect.currentframe()))
currentdir = os.path.dirname(currentdir)
parentdir = os.path.dirname(currentdir)
sys.path.insert(0, parentdir)

from apply_bpe import isolate_glossary, BPE

class TestIsolateGlossaryFunction(unittest.TestCase):
    """Exercise ``isolate_glossary`` with the single glossary entry ``'like'``."""

    def setUp(self):
        self.glossary = 'like'

    def _run_test_case(self, test_case):
        # A test case is an (input word, expected list of segments) pair.
        word, segments = test_case
        self.assertEqual(isolate_glossary(word, self.glossary), segments)

    def test_empty_string(self):
        # An empty input yields a list holding one empty string.
        self._run_test_case(('', ['']))

    def test_no_glossary(self):
        # Input without the glossary is returned as a single segment.
        self._run_test_case(('word', ['word']))

    def test_isolated_glossary(self):
        # Input that is exactly the glossary stays a single segment.
        self._run_test_case(('like', ['like']))

    def test_word_one_side(self):
        # Glossary followed by other text is split off from it.
        self._run_test_case(('likeword', ['like', 'word']))

    def test_words_both_sides(self):
        # Glossary surrounded by other text produces three segments.
        self._run_test_case(('wordlikeword', ['word', 'like', 'word']))

    def test_back_to_back_glossary(self):
        # Adjacent glossary occurrences are separated from each other.
        self._run_test_case(('likelike', ['like', 'like']))

    def test_multiple_glossaries(self):
        # Several glossary occurrences interleaved with other text.
        self._run_test_case(
            ('wordlikewordlike', ['word', 'like', 'word', 'like'])
        )

class TestBPEIsolateGlossariesMethod(unittest.TestCase):
    """Exercise ``BPE._isolate_glossaries`` with several literal glossaries."""

    def setUp(self):
        # A MagicMock stands in for the first BPE constructor argument; its
        # readline() always returns 'something'.
        codes = mock.MagicMock()
        codes.readline.return_value = 'something'
        self.bpe = BPE(codes, glossaries=['like', 'Manuel', 'USA'])

    def _run_test_case(self, test_case):
        # A test case is an (input word, expected list of segments) pair.
        word, segments = test_case
        self.assertEqual(self.bpe._isolate_glossaries(word), segments)

    def test_multiple_glossaries(self):
        # All three glossaries occur in one word, some of them repeatedly.
        self._run_test_case((
            'wordlikeUSAwordManuelManuelwordUSA',
            ['word', 'like', 'USA', 'word', 'Manuel', 'Manuel', 'word', 'USA'],
        ))

class TestRegexIsolateGlossaries(unittest.TestCase):
    """Exercise ``BPE._isolate_glossaries`` with regular-expression glossaries."""

    def setUp(self):
        # A MagicMock stands in for the first BPE constructor argument; its
        # readline() always returns 'something'.
        amock = mock.MagicMock()
        amock.readline.return_value = 'something'
        # Raw strings keep the backslashes of \w and \d intact without relying
        # on invalid escape sequences, which newer Python versions warn about.
        glossaries = [r"<country>\w*</country>", r"<name>\w*</name>", r"\d+"]
        self.bpe = BPE(amock, glossaries=glossaries)

    def _run_test_case(self, test_case):
        # A test case is an (input word, expected list of segments) pair.
        orig, expected = test_case
        out = self.bpe._isolate_glossaries(orig)
        self.assertEqual(out, expected)

    def test_regex_glossaries(self):
        # Tagged spans and digit runs are isolated; the literal text 'like' is
        # not a glossary here, so 'wordlike' stays in one piece.
        orig = 'wordlike<country>USA</country>word10001word<name>Manuel</name>word<country>USA</country>'
        exp = ['wordlike', '<country>USA</country>', 'word', '10001', 'word', '<name>Manuel</name>', 'word', '<country>USA</country>']
        test_case = (orig, exp)
        self._run_test_case(test_case)

def encode_mock(segment, x2, x3, x4, x5, x6, x7, glosses, dropout):
    """Simplified replacement for ``apply_bpe.encode`` used when patching it.

    Returns ``(segment,)`` when ``glosses.match(segment)`` succeeds. Otherwise
    the segment is cut at ``len(segment) // 2`` and both halves are returned.
    The parameters ``x2`` to ``x7`` and ``dropout`` are accepted but ignored.
    """
    if not glosses.match(segment):
        midpoint = len(segment) // 2
        return (segment[:midpoint], segment[midpoint:])
    return (segment,)

class TestBPESegmentMethod(unittest.TestCase):
    """Exercise ``BPE.segment`` while ``apply_bpe.encode`` is replaced by ``encode_mock``."""

    def setUp(self):
        # A MagicMock stands in for the first BPE constructor argument; its
        # readline() always returns 'something'.
        codes = mock.MagicMock()
        codes.readline.return_value = 'something'
        self.bpe = BPE(codes, glossaries=['like', 'Manuel', 'USA'])

    @mock.patch('apply_bpe.encode', side_effect=encode_mock)
    def _run_test_case(self, test_case, encode_function):
        # mock.patch supplies the patched object as the extra trailing
        # argument; it is not inspected here.
        sentence, segmented = test_case
        self.assertEqual(self.bpe.segment(sentence), segmented)

    def test_multiple_glossaries(self):
        # Two space-separated words that both contain glossary entries.
        self._run_test_case((
            'wordlikeword likeManuelword',
            'wo@@ rd@@ like@@ wo@@ rd like@@ Manuel@@ wo@@ rd',
        ))

# Run the tests in this module when the file is executed directly as a script.
if __name__ == '__main__':
    unittest.main()