m3hrdadfi commited on
Commit
e432613
1 Parent(s): fcc4e30

Initial model

Browse files
README.md ADDED
@@ -0,0 +1,220 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ language: is
3
+ datasets:
4
+ - malromur
5
+ tags:
6
+ - audio
7
+ - automatic-speech-recognition
8
+ - speech
9
+ - xlsr-fine-tuning-week
10
+ license: apache-2.0
11
+ widget:
12
+ - label: Malromur sample 11
13
+ src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/resolve/main/sample11.flac
14
+ - label: Malromur sample 74
15
+ src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/resolve/main/sample74.flac
16
+ model-index:
17
+ - name: XLSR Wav2Vec2 Icelandic by Mehrdad Farahani
18
+ results:
19
+ - task:
20
+ name: Speech Recognition
21
+ type: automatic-speech-recognition
22
+ dataset:
23
+ name: Malromur is
24
+ type: malromur
25
+ args: is
26
+ metrics:
27
+ - name: Test WER
28
+ type: wer
29
+ value: 12.00
30
+
31
+ ---
32
+
33
+ # Wav2Vec2-Large-XLSR-53-Icelandic
34
+
35
+ Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) on Icelandic using [Malromur](https://clarin.is/en/resources/malromur/). When using this model, make sure that your speech input is sampled at 16kHz.
36
+
37
+ ## Usage
38
+ The model can be used directly (without a language model) as follows:
39
+
40
+ **Requirements**
41
+ ```bash
42
+ # Required packages
43
+ !pip install git+https://github.com/huggingface/datasets.git
44
+ !pip install git+https://github.com/huggingface/transformers.git
45
+ !pip install torchaudio
46
+ !pip install librosa
47
+ !pip install jiwer
48
+ !pip install num2words
49
+ ```
50
+
51
+ **Normalizer**
52
+ ```bash
53
+
54
+ # num2word packages
55
+ # Original source: https://github.com/savoirfairelinux/num2words
56
+ !mkdir -p ./num2words
57
+ !wget -O num2words/__init__.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/__init__.py
58
+ !wget -O num2words/base.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/base.py
59
+ !wget -O num2words/compat.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/compat.py
60
+ !wget -O num2words/currency.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/currency.py
61
+ !wget -O num2words/lang_EU.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/lang_EU.py
62
+ !wget -O num2words/lang_IS.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/lang_IS.py
63
+ !wget -O num2words/utils.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/utils.py
64
+
65
+ # Malromur_test selected based on gender and age
66
+ !wget -O malromur_test.csv https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/malromur_test.csv
67
+
68
+ # Normalizer
69
+ !wget -O normalizer.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/normalizer.py
70
+ ```
71
+
72
+ **Prediction**
73
+ ```python
74
+ import librosa
75
+ import torch
76
+ import torchaudio
77
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
78
+ from datasets import load_dataset
79
+
80
+ import numpy as np
81
+ import re
82
+ import string
83
+
84
+ import IPython.display as ipd
85
+
86
+ from normalizer import Normalizer
87
+
88
+ normalizer = Normalizer(lang="is")
89
+
90
+
91
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["path"]`` and store it resampled to 16 kHz.

    Args:
        batch: A dataset row containing a ``"path"`` entry that
            ``torchaudio.load`` can open.

    Returns:
        The same ``batch`` with the resampled waveform stored under
        ``"speech"``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    # Pass the sample rates by keyword: recent librosa releases no longer
    # accept them positionally, and the keyword form also works on older ones.
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=16_000,
    )

    batch["speech"] = speech_array
    return batch
98
+
99
+
100
def predict(batch):
    """Transcribe ``batch["speech"]`` and store the text under ``"predicted"``.

    Relies on the module-level ``processor``, ``model`` and ``device``.
    """
    encoded = processor(
        batch["speech"],
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(
            encoded.input_values.to(device),
            attention_mask=encoded.attention_mask.to(device),
        )

    # Greedy decoding: take the highest-scoring token at every frame.
    token_ids = output.logits.argmax(dim=-1)
    transcripts = processor.batch_decode(token_ids)

    batch["predicted"] = transcripts[0]
    return batch
113
+
114
+
115
# Run on the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic")
model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic").to(device)

dataset = load_dataset("csv", data_files={"test": "./malromur_test.csv"})["test"]
# Normalize the reference sentences and drop every column except the two
# that the rest of the script reads.
# NOTE(review): Normalizer.__call__ accepts `remove_extra_space` through
# **kwargs but does not act on it.
dataset = dataset.map(
    normalizer,
    fn_kwargs={"remove_extra_space": True},
    remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
)

dataset = dataset.map(speech_file_to_array_fn)
result = dataset.map(predict)

# Read both text columns once instead of re-reading them on every iteration.
references = result["sentence"]
predictions = result["predicted"]

# Print 20 randomly chosen examples (indices are drawn with replacement).
sample_indices = np.random.randint(0, len(result), 20).tolist()
for i in sample_indices:
    print("reference:", references[i])
    print("predicted:", predictions[i])
    print('---')
135
+ ```
136
+
137
+ **Output:**
138
+ ```text
139
+ SOON
140
+ ```
141
+
142
+
143
+ ## Evaluation
144
+
145
+ The model can be evaluated as follows on the Malromur test set (`malromur_test.csv`).
146
+
147
+ ```python
148
+ import librosa
149
+ import torch
150
+ import torchaudio
151
+ from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
152
+ from datasets import load_dataset, load_metric
153
+
154
+ import numpy as np
155
+ import re
156
+ import string
157
+
158
+ from normalizer import Normalizer
159
+
160
+ normalizer = Normalizer(lang="is")
161
+
162
+
163
def speech_file_to_array_fn(batch):
    """Load the audio file at ``batch["path"]`` and store it resampled to 16 kHz.

    Args:
        batch: A dataset row containing a ``"path"`` entry that
            ``torchaudio.load`` can open.

    Returns:
        The same ``batch`` with the resampled waveform stored under
        ``"speech"``.
    """
    speech_array, sampling_rate = torchaudio.load(batch["path"])
    speech_array = speech_array.squeeze().numpy()
    # Pass the sample rates by keyword: recent librosa releases no longer
    # accept them positionally, and the keyword form also works on older ones.
    speech_array = librosa.resample(
        np.asarray(speech_array),
        orig_sr=sampling_rate,
        target_sr=16_000,
    )

    batch["speech"] = speech_array
    return batch
170
+
171
+
172
def predict(batch):
    """Transcribe ``batch["speech"]`` and store the text under ``"predicted"``.

    Relies on the module-level ``processor``, ``model`` and ``device``.
    """
    encoded = processor(
        batch["speech"],
        sampling_rate=16_000,
        return_tensors="pt",
        padding=True,
    )

    # Inference only, so gradient tracking is switched off.
    with torch.no_grad():
        output = model(
            encoded.input_values.to(device),
            attention_mask=encoded.attention_mask.to(device),
        )

    # Greedy decoding: take the highest-scoring token at every frame.
    token_ids = output.logits.argmax(dim=-1)
    transcripts = processor.batch_decode(token_ids)

    batch["predicted"] = transcripts[0]
    return batch
185
+
186
+
187
# Run on the GPU when one is available, otherwise fall back to the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic")
model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic").to(device)

dataset = load_dataset("csv", data_files={"test": "./malromur_test.csv"})["test"]
# Normalize the reference sentences and keep only the columns used below.
dataset = dataset.map(
    normalizer,
    fn_kwargs={"remove_extra_space": True},
    remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
)

dataset = dataset.map(speech_file_to_array_fn)
result = dataset.map(predict)

wer = load_metric("wer")

# Word error rate between predictions and references, printed as a percentage.
score = wer.compute(predictions=result["predicted"], references=result["sentence"])
print("WER: {:.2f}".format(100 * score))
204
+ ```
205
+
206
+
207
+ **Test Result**:
208
+ - WER: 12.00%
209
+
210
+
211
+ ## Training & Report
212
+ The Malromur `train` and `validation` splits were used for training.
213
+
214
+ You can see the training stats [here](#)
215
+
216
+ The script used for training can be found [here](#)
217
+
218
+
219
+ ## Questions?
220
+ Post a Github issue on the [Wav2Vec](https://github.com/m3hrdadfi/wav2vec) repo.
malromur_test.csv ADDED
The diff for this file is too large to render. See raw diff
 
normalizer.py ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import sys
3
+ import textwrap
4
+ from typing import Any, Dict, Optional
5
+ from num2words import num2words
6
+
7
+
8
class Normalizer:
    """Callable text normalizer for transcript sentences.

    Calling an instance on a batch dictionary optionally lowercases the text,
    applies the replacement ``dictionary``, keeps only the character runs
    matched by ``whitelist`` and, when ``lang`` is set, spells out integer
    tokens with ``num2words``.
    """

    # Default pattern for one run of characters to keep: ASCII digits, a-z
    # and the accented/special Icelandic letters. It is applied with
    # re.IGNORECASE, so upper-case letters are kept as well.
    _whitelist = r"[0-9a-zádðéíóúýþæö]+"
    # Default replacement mapping (empty: nothing is replaced).
    _dictionary = {}
    # Default key under which the text is stored in a batch.
    _text_key_name: str = "sentence"
    _do_lowercase: bool = True

    def __init__(
        self,
        whitelist: Optional[str] = None,
        dictionary: Optional[Dict[str, str]] = None,
        lang: Optional[str] = None
    ) -> None:
        """Create a normalizer.

        Args:
            whitelist: Regex matching one run of characters to keep. Falls
                back to the class default when empty or not a string.
            dictionary: Mapping of substrings to their replacements. Falls
                back to the (empty) class default when empty or not a dict.
            lang: Language code handed to ``num2words``. When falsy, numbers
                are left as digits.
        """
        self.text_key_name = self._text_key_name
        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
        if dictionary and isinstance(dictionary, dict):
            self.dictionary = dictionary
        else:
            # Copy the default so instances never share one mutable mapping.
            self.dictionary = dict(self._dictionary)
        self.do_lowercase = self._do_lowercase
        self.lang = lang

    def chars_to_map(self, sentence: str) -> str:
        """Replace every occurrence of a dictionary key with its value.

        Args:
            sentence (str): A piece of text.

        Returns:
            The text with replacements applied, or ``sentence`` unchanged
            when the dictionary is empty.
        """
        if not self.dictionary:
            return sentence

        # Regex alternation picks the first alternative that matches, so try
        # longer keys first; otherwise a key that is a prefix of another key
        # would always win.
        keys = sorted(self.dictionary, key=len, reverse=True)
        pattern = "|".join(map(re.escape, keys))
        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))

    def chars_to_preserve(
        self,
        sentence: str,
    ) -> str:
        """Keep only the character runs matched by the whitelist.

        Args:
            sentence (str): A piece of text.

        Returns:
            The matched runs joined by single spaces.

        Raises:
            Exception: Whatever ``re.findall`` raises (for example for an
                invalid whitelist pattern); a diagnostic is printed first.
        """
        try:
            tokenized = re.findall(self.whitelist, sentence, re.IGNORECASE)
            return " ".join(tokenized)
        except Exception as error:
            print(
                textwrap.dedent(
                    f"""
                    Bad characters range {self.whitelist},
                    {error}
                    """
                )
            )
            raise

    def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str:
        """Spell out integer tokens in the configured language.

        Whitespace-separated tokens that parse as integers are replaced by
        their ``num2words`` spelling; all other tokens are kept as they are.

        Args:
            sentence (str): A piece of text.
            *args: Accepted for call compatibility; not used.
            **kwargs: Accepted for call compatibility; not used.

        Returns:
            The tokens joined by single spaces, or ``sentence`` unchanged
            when no language is configured.
        """
        if not self.lang:
            return sentence

        words = []
        for word in sentence.split():
            try:
                number = int(word)
            except ValueError:
                # Not an integer token: keep it untouched.
                words.append(word)
                continue

            try:
                words.append(str(num2words(number, lang=self.lang)))
            except Exception:
                # Best effort: if the number cannot be spelled out, keep its
                # digits. Unlike a bare ``except``, this lets
                # KeyboardInterrupt and SystemExit propagate.
                words.append(str(number))

        return " ".join(words)

    def __call__(
        self,
        batch: Dict,
        return_dict: bool = True,
        do_lastspace_removing: bool = False,
        text_key_name: Optional[str] = None,
        do_lowercase: Optional[bool] = None,
        *args: Any,
        **kwargs: Any,
    ) -> Any:
        """Normalize the text stored in ``batch``.

        Args:
            batch (Dict): A batch of input holding the text under ``text_key_name``.
            return_dict (bool, optional): Whether to return the updated batch
                instead of just the text. Defaults to True.
            do_lastspace_removing (bool, optional): When False, a single
                trailing space is appended to the normalized text; when True,
                it is not. Defaults to False.
            text_key_name (str, optional): The key name of text in the batch
                input. Defaults to the instance's ``text_key_name``.
            do_lowercase (bool, optional): Whether to lowercase the text.
                Defaults to the instance's ``do_lowercase`` when not a bool.
            *args: Forwarded to ``text_level_normalizer``.
            **kwargs: Forwarded to ``text_level_normalizer``.

        Returns:
            The batch with the normalized text written back under
            ``text_key_name``, or only the normalized text when
            ``return_dict`` is False.

        Raises:
            KeyError: If ``text_key_name`` is not a key of ``batch``.
        """

        text_key_name = text_key_name if text_key_name else self.text_key_name
        do_lowercase = do_lowercase if isinstance(do_lowercase, bool) else self.do_lowercase

        if text_key_name not in batch:
            raise KeyError(
                textwrap.dedent(
                    f"""
                    keyname {text_key_name} not existed in the batch dictionary,
                    the batch dictionary consists of the following keys {list(batch.keys())},
                    you can easily add a new keyname by passing the `text_key_name` into Normalizer.
                    """
                )
            )

        text = batch[text_key_name].strip()

        if do_lowercase:
            text = text.lower()

        text = self.chars_to_map(text)
        text = self.chars_to_preserve(text)
        text = self.text_level_normalizer(text, *args, **kwargs)

        text = text.strip()
        if not do_lastspace_removing:
            text = text + " "

        if not return_dict:
            return text

        batch[text_key_name] = text
        return batch
num2words/__init__.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
3
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
4
+
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ # This library is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
+ # Lesser General Public License for more details.
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ # MA 02110-1301 USA
17
+
18
+ from __future__ import unicode_literals
19
+
20
+ from . import (
21
+ lang_IS
22
+ )
23
+
24
# Maps a language code to the converter instance that handles it; only
# Icelandic ('is') is registered in this trimmed-down copy.
CONVERTER_CLASSES = {
    'is': lang_IS.Num2Word_IS()
}

# Conversion kinds accepted as the `to` argument of num2words(); each name is
# resolved to a `to_<name>` method on the converter.
CONVERTES_TYPES = ['cardinal', 'ordinal', 'ordinal_num', 'year', 'currency']
29
+
30
+
31
def num2words(number, ordinal=False, lang='en', to='cardinal', **kwargs):
    """Convert ``number`` to words using the converter registered for ``lang``.

    Args:
        number: The value to convert. A string is first parsed with the
            converter's ``str_to_number``.
        ordinal (bool): Backwards-compatible switch; when true the result of
            the converter's ``to_ordinal`` is returned and ``to`` is ignored.
        lang (str): Language code. The full code is tried first, then its
            first two letters.
        to (str): Conversion kind, one of ``CONVERTES_TYPES``.
        **kwargs: Passed through to the converter's ``to_<to>`` method.

    Returns:
        Whatever the selected converter method returns.

    Raises:
        NotImplementedError: If no converter is registered for ``lang`` or
            ``to`` is not a known conversion kind.
    """
    requested_lang = lang
    # We try the full language first
    if lang not in CONVERTER_CLASSES:
        # ... and then try only the first 2 letters
        lang = lang[:2]
    if lang not in CONVERTER_CLASSES:
        raise NotImplementedError(
            'Language "%s" is not supported' % requested_lang)
    converter = CONVERTER_CLASSES[lang]

    if isinstance(number, str):
        number = converter.str_to_number(number)

    # backwards compatible
    if ordinal:
        return converter.to_ordinal(number)

    if to not in CONVERTES_TYPES:
        raise NotImplementedError(
            'Conversion type "%s" is not supported' % to)

    return getattr(converter, 'to_{}'.format(to))(number, **kwargs)
num2words/base.py ADDED
@@ -0,0 +1,306 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
3
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
4
+
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ # This library is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
+ # Lesser General Public License for more details.
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ # MA 02110-1301 USA
17
+
18
+ from __future__ import unicode_literals
19
+
20
+ import math
21
+ from collections import OrderedDict
22
+ from decimal import Decimal
23
+
24
+ from .compat import to_s
25
+ from .currency import parse_currency_parts, prefix_currency
26
+
27
+
28
class Num2Word_Base(object):
    """Language-independent base class for number-to-words converters.

    Subclasses provide the number words (``high_numwords``,
    ``mid_numwords``, ``low_numwords``) in ``setup`` and implement ``merge``,
    ``set_high_numwords`` and, for currencies, ``pluralize``.
    """

    # Currency code -> (unit forms, subunit forms); filled in by subclasses.
    CURRENCY_FORMS = {}
    # Currency code -> adjective to prefix the unit name with.
    CURRENCY_ADJECTIVES = {}

    def __init__(self):
        """Set the defaults, run ``setup`` and build the ``cards`` table."""
        self.is_title = False
        self.precision = 2
        self.exclude_title = []
        self.negword = "(-) "
        self.pointword = "(.)"
        self.errmsg_nonnum = "type(%s) not in [long, int, float]"
        self.errmsg_floatord = "Cannot treat float %s as ordinal."
        self.errmsg_negord = "Cannot treat negative num %s as ordinal."
        self.errmsg_toobig = "abs(%s) must be less than %s."

        self.setup()

        # uses cards
        if any(hasattr(self, field) for field in
               ['high_numwords', 'mid_numwords', 'low_numwords']):
            self.cards = OrderedDict()
            self.set_numwords()
            # The first inserted card is the largest registered number.
            self.MAXVAL = 1000 * list(self.cards.keys())[0]

    def set_numwords(self):
        """Fill ``cards`` from the high, mid and low number words, in that order."""
        self.set_high_numwords(self.high_numwords)
        self.set_mid_numwords(self.mid_numwords)
        self.set_low_numwords(self.low_numwords)

    def set_high_numwords(self, *args):
        """Register the words for large numbers; must be overridden."""
        raise NotImplementedError

    def set_mid_numwords(self, mid):
        """Register ``(number, word)`` pairs from ``mid`` in ``cards``."""
        for key, val in mid:
            self.cards[key] = val

    def set_low_numwords(self, numwords):
        """Register ``numwords``, listed from the largest value down to 0."""
        for word, n in zip(numwords, range(len(numwords) - 1, -1, -1)):
            self.cards[n] = word

    def splitnum(self, value):
        """Decompose ``value`` into a nested list of ``(word, number)`` tuples.

        The first card that is not greater than ``value`` is used as the
        divisor; the quotient and the remainder are decomposed recursively.
        """
        for elem in self.cards:
            if elem > value:
                continue

            out = []
            if value == 0:
                div, mod = 1, 0
            else:
                div, mod = divmod(value, elem)

            if div == 1:
                out.append((self.cards[1], 1))
            else:
                if div == value:  # The system tallies, eg Roman Numerals
                    return [(div * self.cards[elem], div*elem)]
                out.append(self.splitnum(div))

            out.append((self.cards[elem], elem))

            if mod:
                out.append(self.splitnum(mod))

            return out

    def parse_minus(self, num_str):
        """Detach minus and return it as symbol with new num_str."""
        if num_str.startswith('-'):
            # Extra spacing to compensate if there is no minus.
            return '%s ' % self.negword, num_str[1:]
        return '', num_str

    def str_to_number(self, value):
        """Parse a numeric string into a ``Decimal``."""
        return Decimal(value)

    def to_cardinal(self, value):
        """Return ``value`` spelled out as a cardinal number.

        Values that are not integral (or cannot be converted with ``int``)
        are delegated to ``to_cardinal_float``.

        Raises:
            OverflowError: If ``abs(value)`` is not below ``MAXVAL``.
        """
        # Explicit check instead of ``assert``: assertions are stripped under
        # ``python -O``, which would send floats down the integer path.
        try:
            is_integral = int(value) == value
        except (ValueError, TypeError):
            is_integral = False
        if not is_integral:
            return self.to_cardinal_float(value)

        out = ""
        if value < 0:
            value = abs(value)
            out = self.negword

        if value >= self.MAXVAL:
            raise OverflowError(self.errmsg_toobig % (value, self.MAXVAL))

        val = self.splitnum(value)
        words, num = self.clean(val)
        return self.title(out + words)

    def float2tuple(self, value):
        """Split ``value`` into its integer part and its decimal digits.

        Side effect: ``self.precision`` is set to the number of decimal
        places in ``str(value)``.

        Returns:
            ``(pre, post)`` where ``post`` is the fractional part scaled by
            ``10 ** self.precision`` and converted to ``int``.
        """
        pre = int(value)

        # Simple way of finding decimal places to update the precision
        self.precision = abs(Decimal(str(value)).as_tuple().exponent)

        post = abs(value - pre) * 10**self.precision
        if abs(round(post) - post) < 0.01:
            # We generally floor all values beyond our precision (rather than
            # rounding), but in cases where we have something like 1.239999999,
            # which is probably due to python's handling of floats, we actually
            # want to consider it as 1.24 instead of 1.23
            post = int(round(post))
        else:
            post = int(math.floor(post))

        return pre, post

    def to_cardinal_float(self, value):
        """Spell out a non-integral number digit by digit after the point.

        Raises:
            TypeError: If ``value`` cannot be converted to ``float``.
        """
        try:
            # Only the conversion matters here; it raises for non-numbers.
            float(value)
        except (ValueError, TypeError, AssertionError, AttributeError):
            raise TypeError(self.errmsg_nonnum % value)

        pre, post = self.float2tuple(float(value))

        # Left-pad with zeros so leading decimal zeros are spelled out too.
        post = str(post)
        post = '0' * (self.precision - len(post)) + post

        out = [self.to_cardinal(pre)]
        if self.precision:
            out.append(self.title(self.pointword))

        for i in range(self.precision):
            curr = int(post[i])
            out.append(to_s(self.to_cardinal(curr)))

        return " ".join(out)

    def merge(self, curr, next):
        """Combine two adjacent ``(word, number)`` tuples; must be overridden."""
        raise NotImplementedError

    def clean(self, val):
        """Collapse the nested structure from ``splitnum`` into one tuple.

        Adjacent tuples are combined with ``merge`` and nested lists are
        cleaned recursively until a single element remains.
        """
        out = val
        while len(val) != 1:
            out = []
            left, right = val[:2]
            if isinstance(left, tuple) and isinstance(right, tuple):
                out.append(self.merge(left, right))
                if val[2:]:
                    out.append(val[2:])
            else:
                for elem in val:
                    if isinstance(elem, list):
                        if len(elem) == 1:
                            out.append(elem[0])
                        else:
                            out.append(self.clean(elem))
                    else:
                        out.append(elem)
            val = out
        return out[0]

    def title(self, value):
        """Capitalize each word of ``value`` when ``is_title`` is set.

        Words listed in ``exclude_title`` are left unchanged.
        """
        if self.is_title:
            out = []
            value = value.split()
            for word in value:
                if word in self.exclude_title:
                    out.append(word)
                else:
                    out.append(word[0].upper() + word[1:])
            value = " ".join(out)
        return value

    def verify_ordinal(self, value):
        """Raise ``TypeError`` if ``value`` is not a non-negative integer value."""
        if not value == int(value):
            raise TypeError(self.errmsg_floatord % value)
        if not abs(value) == value:
            raise TypeError(self.errmsg_negord % value)

    def to_ordinal(self, value):
        """Return the ordinal form; the base class falls back to the cardinal."""
        return self.to_cardinal(value)

    def to_ordinal_num(self, value):
        """Return the numeric ordinal form; the base class returns ``value`` as is."""
        return value

    # Trivial version
    def inflect(self, value, text):
        """Pick the singular or plural of a ``"stem/suffix"`` pattern.

        Returns the part before the first ``/`` when ``value`` is 1,
        otherwise all parts joined together.
        """
        text = text.split("/")
        if value == 1:
            return text[0]
        return "".join(text)

    # //CHECK: generalise? Any others like pounds/shillings/pence?
    def to_splitnum(self, val, hightxt="", lowtxt="", jointxt="",
                    divisor=100, longval=True, cents=True):
        """Spell out a two-part quantity such as units and subunits.

        Args:
            val: A float, a ``(high, low)`` pair, or a number that is split
                with ``divmod(val, divisor)``.
            hightxt: Unit name for the high part (``inflect`` pattern).
            lowtxt: Unit name for the low part (``inflect`` pattern).
            jointxt: Word placed between the two parts.
            divisor: Divisor used to split a plain number.
            longval: Whether to include the unit names and ``jointxt``
                when both parts are present.
            cents: Spell out the low part instead of formatting it as ``%02d``.

        Returns:
            The words joined by single spaces.
        """
        out = []

        if isinstance(val, float):
            high, low = self.float2tuple(val)
        else:
            try:
                high, low = val
            except TypeError:
                high, low = divmod(val, divisor)

        if high:
            hightxt = self.title(self.inflect(high, hightxt))
            out.append(self.to_cardinal(high))
            if low:
                if longval:
                    if hightxt:
                        out.append(hightxt)
                    if jointxt:
                        out.append(self.title(jointxt))
            elif hightxt:
                out.append(hightxt)

        if low:
            if cents:
                out.append(self.to_cardinal(low))
            else:
                out.append("%02d" % low)
            if lowtxt and longval:
                out.append(self.title(self.inflect(low, lowtxt)))

        return " ".join(out)

    def to_year(self, value, **kwargs):
        """Return the year in words; the base class uses the cardinal form."""
        return self.to_cardinal(value)

    def pluralize(self, n, forms):
        """
        Should resolve gettext form:
        http://docs.translatehouse.org/projects/localization-guide/en/latest/l10n/pluralforms.html
        """
        raise NotImplementedError

    def _cents_verbose(self, number, currency):
        """Return the subunit amount spelled out in words."""
        return self.to_cardinal(number)

    def _cents_terse(self, number, currency):
        """Return the subunit amount as a two-digit string."""
        return "%02d" % number

    def to_currency(self, val, currency='EUR', cents=True, separator=',',
                    adjective=False):
        """
        Args:
            val: Numeric value
            currency (str): Currency code
            cents (bool): Verbose cents
            separator (str): Cent separator
            adjective (bool): Prefix currency name with adjective
        Returns:
            str: Formatted string
        Raises:
            NotImplementedError: If ``currency`` has no entry in
                ``CURRENCY_FORMS``.

        """
        left, right, is_negative = parse_currency_parts(val)

        try:
            cr1, cr2 = self.CURRENCY_FORMS[currency]

        except KeyError:
            raise NotImplementedError(
                'Currency code "%s" not implemented for "%s"' %
                (currency, self.__class__.__name__))

        if adjective and currency in self.CURRENCY_ADJECTIVES:
            cr1 = prefix_currency(self.CURRENCY_ADJECTIVES[currency], cr1)

        minus_str = "%s " % self.negword if is_negative else ""
        cents_str = self._cents_verbose(right, currency) \
            if cents else self._cents_terse(right, currency)

        return u'%s%s %s%s %s %s' % (
            minus_str,
            self.to_cardinal(left),
            self.pluralize(left, cr1),
            separator,
            cents_str,
            self.pluralize(right, cr2)
        )

    def setup(self):
        """Hook for subclasses to define number words and override defaults."""
        pass
num2words/compat.py ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
3
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
4
+
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ # This library is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
+ # Lesser General Public License for more details.
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ # MA 02110-1301 USA
17
+
18
+
19
# Python 2/3 compatibility: use `basestring` where it exists, and fall back
# to `str` when the name is not defined.
try:
    strtype = basestring
except NameError:
    strtype = str
23
+
24
+
25
def to_s(val):
    """Return ``val`` as text, using ``unicode`` where that name exists."""
    try:
        text_type = unicode
    except NameError:
        # No ``unicode`` builtin available: plain ``str`` is the text type.
        text_type = str
    return text_type(val)
num2words/currency.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
3
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
4
+
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ # This library is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
+ # Lesser General Public License for more details.
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ # MA 02110-1301 USA
17
+
18
+ from __future__ import division
19
+
20
+ from decimal import ROUND_HALF_UP, Decimal
21
+
22
+
23
def parse_currency_parts(value, is_int_with_cents=True):
    """Split a monetary amount into whole units, cents and a sign flag.

    Args:
        value: An ``int``, or anything ``Decimal`` accepts. Non-integers are
            rounded half-up to two decimal places.
        is_int_with_cents: For ``int`` input, whether the value is an amount
            in cents (True) or in whole units (False).

    Returns:
        ``(integer, cents, negative)`` where the first two are non-negative
        and ``negative`` tells whether the input was below zero.
    """
    if not isinstance(value, int):
        amount = Decimal(value).quantize(
            Decimal('.01'),
            rounding=ROUND_HALF_UP
        )
        negative = amount < 0
        whole, fraction = divmod(abs(amount), 1)
        return int(whole), int(fraction * 100), negative

    negative = value < 0
    magnitude = abs(value)
    if is_int_with_cents:
        # assume cents if value is integer
        integer, cents = divmod(magnitude, 100)
    else:
        integer, cents = magnitude, 0

    return integer, cents, negative
47
+
48
+
49
def prefix_currency(prefix, base):
    """Return the forms in ``base`` with ``prefix`` and a space put in front of each."""
    prefixed = []
    for form in base:
        prefixed.append("%s %s" % (prefix, form))
    return tuple(prefixed)
num2words/lang_EU.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
3
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
4
+
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ # This library is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
+ # Lesser General Public License for more details.
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ # MA 02110-1301 USA
17
+
18
+ from __future__ import unicode_literals
19
+
20
+ from .base import Num2Word_Base
21
+
22
# Shared (singular, plural) forms reused by several CURRENCY_FORMS entries.
GENERIC_DOLLARS = ('dollar', 'dollars')
GENERIC_CENTS = ('cent', 'cents')
24
+
25
+
26
class Num2Word_EU(Num2Word_Base):
    """Shared base for converters that name large numbers with Latin prefixes.

    Provides the currency tables, the "-illion"/"-illiard" word generation
    and a simple singular/plural ``pluralize``.
    """

    # Currency code -> (unit forms, subunit forms).
    CURRENCY_FORMS = {
        'AUD': (GENERIC_DOLLARS, GENERIC_CENTS),
        'CAD': (GENERIC_DOLLARS, GENERIC_CENTS),
        # replaced by EUR
        'EEK': (('kroon', 'kroons'), ('sent', 'senti')),
        'EUR': (('euro', 'euro'), GENERIC_CENTS),
        'GBP': (('pound sterling', 'pounds sterling'), ('penny', 'pence')),
        # replaced by EUR
        'LTL': (('litas', 'litas'), GENERIC_CENTS),
        # replaced by EUR
        'LVL': (('lat', 'lats'), ('santim', 'santims')),
        'USD': (GENERIC_DOLLARS, GENERIC_CENTS),
        'RUB': (('rouble', 'roubles'), ('kopek', 'kopeks')),
        'SEK': (('krona', 'kronor'), ('öre', 'öre')),
        'NOK': (('krone', 'kroner'), ('øre', 'øre')),
        'PLN': (('zloty', 'zlotys', 'zlotu'), ('grosz', 'groszy')),
        'MXN': (('peso', 'pesos'), GENERIC_CENTS),
        'RON': (('leu', 'lei', 'de lei'), ('ban', 'bani', 'de bani')),
        'INR': (('rupee', 'rupees'), ('paisa', 'paise')),
        'HUF': (('forint', 'forint'), ('fillér', 'fillér')),
        'ISK': (('króna', 'krónur'), ('aur', 'aurar')),
    }

    # Currency code -> adjective that may be put before the unit name.
    CURRENCY_ADJECTIVES = {
        'AUD': 'Australian',
        'CAD': 'Canadian',
        'EEK': 'Estonian',
        'USD': 'US',
        'RUB': 'Russian',
        'NOK': 'Norwegian',
        'MXN': 'Mexican',
        'RON': 'Romanian',
        'INR': 'Indian',
        'HUF': 'Hungarian',
        'ISK': 'íslenskar',
    }

    GIGA_SUFFIX = "illiard"
    MEGA_SUFFIX = "illion"

    def set_high_numwords(self, high):
        """Register the large-number words built from the prefixes in ``high``.

        The prefixes are listed from the largest number down. Each prefix
        yields a GIGA_SUFFIX word at ``10 ** n`` and a MEGA_SUFFIX word at
        ``10 ** (n - 3)``, with ``n`` dropping by 6 per prefix.
        """
        top_exponent = 3 + 6 * len(high)

        for position, prefix in enumerate(high):
            exponent = top_exponent - 6 * position

            if self.GIGA_SUFFIX:
                self.cards[10 ** exponent] = prefix + self.GIGA_SUFFIX

            if self.MEGA_SUFFIX:
                self.cards[10 ** (exponent - 3)] = prefix + self.MEGA_SUFFIX

    def gen_high_numwords(self, units, tens, lows):
        """Return every unit+ten prefix, largest first, followed by ``lows``."""
        combined = [unit + ten for ten in tens for unit in units]
        return combined[::-1] + lows

    def pluralize(self, n, forms):
        """Return ``forms[0]`` when ``n`` is 1, otherwise ``forms[1]``."""
        if n == 1:
            return forms[0]
        return forms[1]

    def setup(self):
        """Build ``high_numwords`` from the Latin prefix tables."""
        low_prefixes = ["non", "oct", "sept", "sext", "quint", "quadr", "tr", "b", "m"]
        unit_prefixes = ["", "un", "duo", "tre", "quattuor", "quin", "sex", "sept",
                         "octo", "novem"]
        ten_prefixes = ["dec", "vigint", "trigint", "quadragint", "quinquagint",
                        "sexagint", "septuagint", "octogint", "nonagint"]
        generated = self.gen_high_numwords(unit_prefixes, ten_prefixes, low_prefixes)
        self.high_numwords = ["cent"] + generated
num2words/lang_IS.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ # -*- coding: utf-8 -*-
3
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
4
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
5
+
6
+ # This library is free software; you can redistribute it and/or
7
+ # modify it under the terms of the GNU Lesser General Public
8
+ # License as published by the Free Software Foundation; either
9
+ # version 2.1 of the License, or (at your option) any later version.
10
+ # This library is distributed in the hope that it will be useful,
11
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
12
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13
+ # Lesser General Public License for more details.
14
+ # You should have received a copy of the GNU Lesser General Public
15
+ # License along with this library; if not, write to the Free Software
16
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
17
+ # MA 02110-1301 USA
18
+
19
+ from __future__ import division, print_function, unicode_literals
20
+
21
+ from . import lang_EU
22
+
23
# Grammatical genders, used as indexes into the tuples stored in GENDERS.
KK = 0   # Karlkyn (masculine)
KVK = 1  # Kvenkyn (feminine)
HK = 2   # Hvorugkyn (neuter)

# Numerals that change with gender, keyed by their masculine form.
# Each value is (masculine, feminine, neuter), matching KK / KVK / HK.
GENDERS = {
    "einn": ("einn", "ein", "eitt"),
    "tveir": ("tveir", "tvær", "tvö"),
    "þrír": ("þrír", "þrjár", "þrjú"),
    "fjórir": ("fjórir", "fjórar", "fjögur"),
}

# Nouns with a distinct plural, stored as (singular, plural).
PLURALS = {
    "hundrað": ("hundrað", "hundruð"),
}
38
+
39
+
40
class Num2Word_IS(lang_EU.Num2Word_EU):
    """Icelandic cardinal number words.

    Only cardinals are implemented here: the ordinal, year and currency
    conversions raise ``NotImplementedError``.
    """

    GIGA_SUFFIX = "illjarður"
    MEGA_SUFFIX = "illjón"

    def setup(self):
        """Define the Icelandic vocabulary used to spell out numbers."""
        lows = ["okt", "sept", "sext", "kvint", "kvaðr", "tr", "b", "m"]
        self.high_numwords = self.gen_high_numwords([], [], lows)

        self.negword = "mínus "
        self.pointword = "komma"

        # All words should be excluded, title case is not used in Icelandic
        self.exclude_title = ["og", "komma", "mínus"]

        self.mid_numwords = [(1000, "þúsund"), (100, "hundrað"),
                             (90, "níutíu"), (80, "áttatíu"), (70, "sjötíu"),
                             (60, "sextíu"), (50, "fimmtíu"), (40, "fjörutíu"),
                             (30, "þrjátíu")]
        self.low_numwords = ["tuttugu", "nítján", "átján", "sautján",
                             "sextán", "fimmtán", "fjórtán", "þrettán",
                             "tólf", "ellefu", "tíu", "níu", "átta",
                             "sjö", "sex", "fimm", "fjórir", "þrír",
                             "tveir", "einn", "núll"]
        self.ords = {"einn": "fyrsti",
                     "tveir": "annar",
                     "þrír": "þriðji",
                     "fjórir": "fjórði",
                     "fimm": "fimmti",
                     "sex": "sjötti",
                     "sjö": "sjöundi",
                     "átta": "áttundi",
                     "níu": "níundi",
                     "tíu": "tíundi",
                     "ellefu": "ellefti",
                     "tólf": "tólfti"}

    def pluralize(self, n, noun):
        """Return ``noun`` in the grammatical number required by count ``n``.

        The singular is kept when ``n`` ends in 1 but not in 11. Otherwise
        the GIGA/MEGA suffix is swapped for its plural ending, or the plural
        is looked up in ``PLURALS``; nouns matching neither are returned
        unchanged.
        """
        form = 0 if (n % 10 == 1 and n % 100 != 11) else 1
        if form == 0:
            return noun
        elif self.GIGA_SUFFIX in noun:
            return noun.replace(self.GIGA_SUFFIX, "illjarðar")
        elif self.MEGA_SUFFIX in noun:
            return noun.replace(self.MEGA_SUFFIX, "illjónir")
        elif noun not in PLURALS:
            return noun
        return PLURALS[noun][form]

    def genderize(self, adj, noun):
        """Make the last word of ``adj`` agree in gender with ``noun``.

        Only the numerals listed in ``GENDERS`` are inflected. Hundreds and
        thousands take the neuter form, "illjón" nouns the feminine, and
        everything else (including "illjarð" nouns) the masculine.
        """
        words = adj.split()
        # Nothing to inflect for empty text or a non-declinable last word.
        if not words or words[-1] not in GENDERS:
            return adj
        last = words[-1]
        gender = KK
        if "hund" in noun or "þús" in noun:
            gender = HK
        elif "illjarð" in noun:
            gender = KK
        elif "illjón" in noun:
            gender = KVK
        # Swap only the final occurrence: str.replace would also rewrite an
        # identical substring appearing earlier in the text.
        head, _, tail = adj.rpartition(last)
        return head + GENDERS[last][gender] + tail

    def merge(self, lpair, rpair):
        """Combine two ``(text, number)`` pairs into one.

        * A left value of 1 in front of a number below 100 is dropped.
        * A smaller left value multiplies the right one: the right noun is
          pluralised by the left count and the left numeral takes the
          noun's gender.
        * A larger left value is added to the right one, joined with "og"
          when the right value is itself an entry of ``self.cards`` and with
          a plain space otherwise.
        """
        ltext, lnum = lpair
        rtext, rnum = rpair

        if lnum == 1 and rnum < 100:
            return (rtext, rnum)
        elif lnum < rnum:
            rtext = self.pluralize(lnum, rtext)
            ltext = self.genderize(ltext, rtext)
            return ("%s %s" % (ltext, rtext), lnum * rnum)
        elif lnum > rnum and rnum in self.cards:
            # The left number is added here, not used as a count, so the
            # right-hand text must keep its own grammatical number.
            # Pluralising it by lnum would turn an already merged
            # "ein milljón" into "ein milljónir".
            ltext = self.genderize(ltext, rtext)
            return ("%s og %s" % (ltext, rtext), lnum + rnum)
        return ("%s %s" % (ltext, rtext), lnum + rnum)

    def to_ordinal(self, value):
        """Not implemented for Icelandic."""
        raise NotImplementedError

    def to_ordinal_num(self, value):
        """Not implemented for Icelandic."""
        raise NotImplementedError

    def to_year(self, val, suffix=None, longval=True):
        """Not implemented for Icelandic."""
        raise NotImplementedError

    def to_currency(self, val, longval=True):
        """Not implemented for Icelandic."""
        raise NotImplementedError
num2words/utils.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # -*- coding: utf-8 -*-
2
+ # Copyright (c) 2003, Taro Ogawa. All Rights Reserved.
3
+ # Copyright (c) 2013, Savoir-faire Linux inc. All Rights Reserved.
4
+
5
+ # This library is free software; you can redistribute it and/or
6
+ # modify it under the terms of the GNU Lesser General Public
7
+ # License as published by the Free Software Foundation; either
8
+ # version 2.1 of the License, or (at your option) any later version.
9
+ # This library is distributed in the hope that it will be useful,
10
+ # but WITHOUT ANY WARRANTY; without even the implied warranty of
11
+ # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12
+ # Lesser General Public License for more details.
13
+ # You should have received a copy of the GNU Lesser General Public
14
+ # License along with this library; if not, write to the Free Software
15
+ # Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
16
+ # MA 02110-1301 USA
17
+
18
+
19
def splitbyx(n, x, format_int=True):
    """Yield the string ``n`` in chunks of ``x`` characters, aligned right.

    When the length of ``n`` is not a multiple of ``x`` the first chunk is
    the shorter one, so "1234567" split by 3 gives "1", "234", "567". A
    string no longer than ``x`` is yielded whole.

    Each chunk is converted with ``int`` when ``format_int`` is true and
    yielded as a string otherwise.
    """
    length = len(n)
    if length > x:
        # Size of the short leading chunk, if any.
        start = length % x
        if start > 0:
            result = n[:start]
            yield int(result) if format_int else result
        for i in range(start, length, x):
            result = n[i:i + x]
            yield int(result) if format_int else result
    else:
        yield int(n) if format_int else n
31
+
32
+
33
def get_digits(n):
    """Return the last three decimal digits of ``n``, least significant first.

    The value is zero-padded to three digits, so ``get_digits(5)`` is
    ``[5, 0, 0]`` and ``get_digits(1234)`` is ``[4, 3, 2]``. The sign is
    ignored.
    """
    # Format the magnitude only: '%03d' % -5 is '-05', and its '-' cannot be
    # converted to a digit, whereas -123 already yielded [3, 2, 1].
    a = [int(x) for x in reversed(('%03d' % abs(n))[-3:])]
    return a