Initial model

Browse files

Files changed (10) hide show

README.md +220 -0
malromur_test.csv +0 -0
normalizer.py +139 -0
num2words/__init__.py +50 -0
num2words/base.py +306 -0
num2words/compat.py +29 -0
num2words/currency.py +50 -0
num2words/lang_EU.py +93 -0
num2words/lang_IS.py +128 -0
num2words/utils.py +35 -0

README.md ADDED Viewed

	@@ -0,0 +1,220 @@

+---
+language: is
+datasets:
+- malromur
+tags:
+- audio
+- automatic-speech-recognition
+- speech
+- xlsr-fine-tuning-week
+license: apache-2.0
+widget:
+- label: Malromur sample 11
+  src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/resolve/main/sample11.flac
+- label: Malromur sample 74
+  src: https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/resolve/main/sample74.flac
+model-index:
+- name: XLSR Wav2Vec2 Icelandic by Mehrdad Farahani
+  results:
+  - task:
+      name: Speech Recognition
+      type: automatic-speech-recognition
+    dataset:
+      name: Malromur is
+      type: malromur
+      args: is
+    metrics:
+       - name: Test WER
+         type: wer
+         value: 12.00
+---
+# Wav2Vec2-Large-XLSR-53-Icelandic
+Fine-tuned [facebook/wav2vec2-large-xlsr-53](https://huggingface.co/facebook/wav2vec2-large-xlsr-53) in Icelandic using [Malromur](https://clarin.is/en/resources/malromur/). When using this model, make sure that your speech input is sampled at 16kHz.
+## Usage
+The model can be used directly (without a language model) as follows:
+**Requirements**
+```bash
+# requirement packages
+!pip install git+https://github.com/huggingface/datasets.git
+!pip install git+https://github.com/huggingface/transformers.git
+!pip install torchaudio
+!pip install librosa
+!pip install jiwer
+!pip install num2words
+```
+**Normalizer**
+```bash
+# num2word packages
+# Original source: https://github.com/savoirfairelinux/num2words
+!mkdir -p ./num2words
+!wget -O num2words/__init__.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/__init__.py
+!wget -O num2words/base.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/base.py
+!wget -O num2words/compat.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/compat.py
+!wget -O num2words/currency.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/currency.py
+!wget -O num2words/lang_EU.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/lang_EU.py
+!wget -O num2words/lang_IS.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/lang_IS.py
+!wget -O num2words/utils.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/num2words/utils.py
+# Malromur_test selected based on gender and age
+!wget -O malromur_test.csv https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/malromur_test.csv
+# Normalizer
+!wget -O normalizer.py https://huggingface.co/m3hrdadfi/wav2vec2-large-xlsr-icelandic/raw/main/normalizer.py
+```
+**Prediction**
+```python
+import librosa
+import torch
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from datasets import load_dataset
+import numpy as np
+import re
+import string
+import IPython.display as ipd
+from normalizer import Normalizer
+normalizer = Normalizer(lang="is")
+def speech_file_to_array_fn(batch):
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    speech_array = speech_array.squeeze().numpy()
+    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
+    batch["speech"] = speech_array
+    return batch
+def predict(batch):
+    features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+    input_values = features.input_values.to(device)
+    attention_mask = features.attention_mask.to(device)
+    with torch.no_grad():
+        logits = model(input_values, attention_mask=attention_mask).logits
+    pred_ids = torch.argmax(logits, dim=-1)
+    batch["predicted"] = processor.batch_decode(pred_ids)[0]
+    return batch
+# Run on the GPU when one is available, otherwise on the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic")
+model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic").to(device)
+dataset = load_dataset("csv", data_files={"test": "./malromur_test.csv"})["test"]
+# Normalize the reference text and drop every column except 'sentence' and 'path'.
+dataset = dataset.map(
+    normalizer,
+    # NOTE(review): Normalizer.__call__ declares no `remove_extra_space`
+    # parameter, so this value only travels through **kwargs - confirm it is intended.
+    fn_kwargs={"remove_extra_space": True},
+    remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
+)
+dataset = dataset.map(speech_file_to_array_fn)
+result = dataset.map(predict)
+# 20 random row indices (drawn with replacement) to print as a spot check.
+max_items = np.random.randint(0, len(result), 20).tolist()
+for i in max_items:
+    reference, predicted =  result["sentence"][i], result["predicted"][i]
+    print("reference:", reference)
+    print("predicted:", predicted)
+    print('---')
+```
+**Output:**
+```text
+SOON
+```
+## Evaluation
+The model can be evaluated as follows on the Malromur test set (`malromur_test.csv`).
+```python
+import librosa
+import torch
+import torchaudio
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+from datasets import load_dataset, load_metric
+import numpy as np
+import re
+import string
+from normalizer import Normalizer
+normalizer = Normalizer(lang="is")
+def speech_file_to_array_fn(batch):
+    speech_array, sampling_rate = torchaudio.load(batch["path"])
+    speech_array = speech_array.squeeze().numpy()
+    speech_array = librosa.resample(np.asarray(speech_array), sampling_rate, 16_000)
+    batch["speech"] = speech_array
+    return batch
+def predict(batch):
+    features = processor(batch["speech"], sampling_rate=16_000, return_tensors="pt", padding=True)
+    input_values = features.input_values.to(device)
+    attention_mask = features.attention_mask.to(device)
+    with torch.no_grad():
+        logits = model(input_values, attention_mask=attention_mask).logits
+    pred_ids = torch.argmax(logits, dim=-1)
+    batch["predicted"] = processor.batch_decode(pred_ids)[0]
+    return batch
+# Run on the GPU when one is available, otherwise on the CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+processor = Wav2Vec2Processor.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic")
+model = Wav2Vec2ForCTC.from_pretrained("m3hrdadfi/wav2vec2-large-xlsr-icelandic").to(device)
+dataset = load_dataset("csv", data_files={"test": "./malromur_test.csv"})["test"]
+# Normalize the reference text and drop every column except 'sentence' and 'path'.
+dataset = dataset.map(
+    normalizer,
+    # NOTE(review): Normalizer.__call__ declares no `remove_extra_space`
+    # parameter, so this value only travels through **kwargs - confirm it is intended.
+    fn_kwargs={"remove_extra_space": True},
+    remove_columns=list(set(dataset.column_names) - set(['sentence', 'path']))
+)
+dataset = dataset.map(speech_file_to_array_fn)
+result = dataset.map(predict)
+# Word error rate of the predictions against the normalized references, printed as a percentage.
+wer = load_metric("wer")
+print("WER: {:.2f}".format(100 * wer.compute(predictions=result["predicted"], references=result["sentence"])))
+```
+
+**Test Result**:
+- WER: 12.00%
+## Training & Report
+The Malromur dataset (split into `train` and `validation` sets) was used for training.
+You can see the training states [here](#)
+The script used for training can be found [here](#)
+## Questions?
+Post a GitHub issue on the [Wav2Vec](https://github.com/m3hrdadfi/wav2vec) repo.

malromur_test.csv ADDED Viewed

The diff for this file is too large to render. See raw diff

normalizer.py ADDED Viewed

	@@ -0,0 +1,139 @@

+import re
+import sys
+import textwrap
+from typing import Any, Dict, Optional
+from num2words import num2words
+class Normalizer:
+    """A general normalizer for every language"""
+    _whitelist = r"[0-9a-zádðéíóúýþæö]+"
+    _dictionary = {}
+    _text_key_name: str = "sentence"
+    _do_lowercase: bool = True
+    def __init__(
+        self,
+        whitelist: str = None,
+        dictionary: Dict[str, str] = None,
+        lang: str = None
+    ) -> None:
+        self.text_key_name = self._text_key_name
+        self.whitelist = whitelist if whitelist and isinstance(whitelist, str) else self._whitelist
+        self.dictionary = dictionary if dictionary and isinstance(dictionary, dict) else self._dictionary
+        self.do_lowercase = self._do_lowercase
+        self.lang = lang
+    def chars_to_map(self, sentence: str) -> str:
+        """Maps every character, words, and phrase into a proper one.
+        Args:
+            sentence (str): A piece of text.
+        """
+        if not len(self.dictionary) > 0:
+            return sentence
+        pattern = "|".join(map(re.escape, self.dictionary.keys()))
+        return re.sub(pattern, lambda m: self.dictionary[m.group()], str(sentence))
+    def chars_to_preserve(
+        self,
+        sentence: str,
+    ) -> str:
+        """Keeps specified characters from sentence
+        Args:
+            sentence (str): A piece of text.
+        """
+        try:
+            tokenized = re.findall(self.whitelist, sentence, re.IGNORECASE)
+            return " ".join(tokenized)
+        except Exception as error:
+            print(
+                textwrap.dedent(
+                    f"""
+                    Bad characters range {self.whitelist},
+                    {error}
+                    """
+                )
+            )
+            raise
+    def text_level_normalizer(self, sentence: str, *args: Any, **kwargs: Any) -> str:
+        """A text level of normalization.
+        It is handy for some languages that need to add a hierarchy of
+        normalization and filtering at the text level.
+        Args:
+            sentence (str): A piece of text.
+        """
+        text = sentence
+        if not self.lang:
+            return text
+        _text = []
+        for word in text.split():
+            try:
+                word = int(word)
+                word = str(num2words(word, lang=self.lang))
+            except:
+                word = str(word)
+            _text.append(word)
+        return " ".join(_text)
+    def __call__(
+        self,
+        batch: Dict,
+        return_dict: bool = True,
+        do_lastspace_removing: bool = False,
+        text_key_name: Optional[str] = None,
+        do_lowercase: Optional[bool] = None,
+        *args: Any,
+        **kwargs: Any,
+    ) -> Any:
+        """Normalization caller
+        Args:
+            batch (Dict): A batch of input.
+            text_key_name (str, optional): The key name of text in the batch input.
+            return_dict (bool, optional): Whether to return dictionary of batch or not just the text. Defaults to True.
+            do_lastspace_removing (bool, optional): Whether to add extra space at the end of text or not. Defaults to True.
+            do_lowercase (bool, optional): Whether to do lowercase or not. Defaults to None.
+        """
+        text_key_name = text_key_name if text_key_name else self.text_key_name
+        do_lowercase = do_lowercase if isinstance(do_lowercase, bool) else self.do_lowercase
+        if text_key_name not in batch:
+            raise KeyError(
+                textwrap.dedent(
+                    f"""
+                    keyname {text_key_name} not existed in the batch dictionary,
+                    the batch dictionary consists of the following keys {list(batch.keys())},
+                    you can easily add a new keyname by passing the `text_key_name` into Normalizer.
+                    """
+                )
+            )
+        text = batch[text_key_name].strip()
+        if do_lowercase:
+            text = text.lower()
+        text = self.chars_to_map(text)
+        text = self.chars_to_preserve(text)
+        text = self.text_level_normalizer(text, *args, **kwargs)
+        text = text.strip()
+        if not do_lastspace_removing:
+            text = text + " "
+        if not return_dict:
+            return text
+        batch[text_key_name] = text
+        return batch

num2words/__init__.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+from __future__ import unicode_literals
+from . import (
+    lang_IS
+)
+# Maps a language code to a ready-made converter instance; num2words() looks
+# languages up here.
+CONVERTER_CLASSES = {
+    'is': lang_IS.Num2Word_IS()
+}
+# Accepted values for the `to` argument of num2words(); each one selects the
+# converter method named `to_<value>`.
+CONVERTES_TYPES = ['cardinal', 'ordinal', 'ordinal_num', 'year', 'currency']
+def num2words(number, ordinal=False, lang='en', to='cardinal', **kwargs):
+    # We try the full language first
+    if lang not in CONVERTER_CLASSES:
+        # ... and then try only the first 2 letters
+        lang = lang[:2]
+    if lang not in CONVERTER_CLASSES:
+        raise NotImplementedError()
+    converter = CONVERTER_CLASSES[lang]
+    if isinstance(number, str):
+        number = converter.str_to_number(number)
+    # backwards compatible
+    if ordinal:
+        return converter.to_ordinal(number)
+    if to not in CONVERTES_TYPES:
+        raise NotImplementedError()
+    return getattr(converter, 'to_{}'.format(to))(number, **kwargs)

num2words/base.py ADDED Viewed

	@@ -0,0 +1,306 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+from __future__ import unicode_literals
+import math
+from collections import OrderedDict
+from decimal import Decimal
+from .compat import to_s
+from .currency import parse_currency_parts, prefix_currency
+class Num2Word_Base(object):
+    CURRENCY_FORMS = {}
+    CURRENCY_ADJECTIVES = {}
+    def __init__(self):
+        self.is_title = False
+        self.precision = 2
+        self.exclude_title = []
+        self.negword = "(-) "
+        self.pointword = "(.)"
+        self.errmsg_nonnum = "type(%s) not in [long, int, float]"
+        self.errmsg_floatord = "Cannot treat float %s as ordinal."
+        self.errmsg_negord = "Cannot treat negative num %s as ordinal."
+        self.errmsg_toobig = "abs(%s) must be less than %s."
+        self.setup()
+        # uses cards
+        if any(hasattr(self, field) for field in
+               ['high_numwords', 'mid_numwords', 'low_numwords']):
+            self.cards = OrderedDict()
+            self.set_numwords()
+            self.MAXVAL = 1000 * list(self.cards.keys())[0]
+    def set_numwords(self):
+        self.set_high_numwords(self.high_numwords)
+        self.set_mid_numwords(self.mid_numwords)
+        self.set_low_numwords(self.low_numwords)
+    def set_high_numwords(self, *args):
+        raise NotImplementedError
+    def set_mid_numwords(self, mid):
+        for key, val in mid:
+            self.cards[key] = val
+    def set_low_numwords(self, numwords):
+        for word, n in zip(numwords, range(len(numwords) - 1, -1, -1)):
+            self.cards[n] = word
+    def splitnum(self, value):
+        for elem in self.cards:
+            if elem > value:
+                continue
+            out = []
+            if value == 0:
+                div, mod = 1, 0
+            else:
+                div, mod = divmod(value, elem)
+            if div == 1:
+                out.append((self.cards[1], 1))
+            else:
+                if div == value:  # The system tallies, eg Roman Numerals
+                    return [(div * self.cards[elem], div*elem)]
+                out.append(self.splitnum(div))
+            out.append((self.cards[elem], elem))
+            if mod:
+                out.append(self.splitnum(mod))
+            return out
+    def parse_minus(self, num_str):
+        """Detach minus and return it as symbol with new num_str."""
+        if num_str.startswith('-'):
+            # Extra spacing to compensate if there is no minus.
+            return '%s ' % self.negword, num_str[1:]
+        return '', num_str
+    def str_to_number(self, value):
+        return Decimal(value)
+    def to_cardinal(self, value):
+        try:
+            assert int(value) == value
+        except (ValueError, TypeError, AssertionError):
+            return self.to_cardinal_float(value)
+        out = ""
+        if value < 0:
+            value = abs(value)
+            out = self.negword
+        if value >= self.MAXVAL:
+            raise OverflowError(self.errmsg_toobig % (value, self.MAXVAL))
+        val = self.splitnum(value)
+        words, num = self.clean(val)
+        return self.title(out + words)
+    def float2tuple(self, value):
+        pre = int(value)
+        # Simple way of finding decimal places to update the precision
+        self.precision = abs(Decimal(str(value)).as_tuple().exponent)
+        post = abs(value - pre) * 10**self.precision
+        if abs(round(post) - post) < 0.01:
+            # We generally floor all values beyond our precision (rather than
+            # rounding), but in cases where we have something like 1.239999999,
+            # which is probably due to python's handling of floats, we actually
+            # want to consider it as 1.24 instead of 1.23
+            post = int(round(post))
+        else:
+            post = int(math.floor(post))
+        return pre, post
+    def to_cardinal_float(self, value):
+        try:
+            float(value) == value
+        except (ValueError, TypeError, AssertionError, AttributeError):
+            raise TypeError(self.errmsg_nonnum % value)
+        pre, post = self.float2tuple(float(value))
+        post = str(post)
+        post = '0' * (self.precision - len(post)) + post
+        out = [self.to_cardinal(pre)]
+        if self.precision:
+            out.append(self.title(self.pointword))
+        for i in range(self.precision):
+            curr = int(post[i])
+            out.append(to_s(self.to_cardinal(curr)))
+        return " ".join(out)
+    def merge(self, curr, next):
+        raise NotImplementedError
+    def clean(self, val):
+        out = val
+        while len(val) != 1:
+            out = []
+            left, right = val[:2]
+            if isinstance(left, tuple) and isinstance(right, tuple):
+                out.append(self.merge(left, right))
+                if val[2:]:
+                    out.append(val[2:])
+            else:
+                for elem in val:
+                    if isinstance(elem, list):
+                        if len(elem) == 1:
+                            out.append(elem[0])
+                        else:
+                            out.append(self.clean(elem))
+                    else:
+                        out.append(elem)
+            val = out
+        return out[0]
+    def title(self, value):
+        if self.is_title:
+            out = []
+            value = value.split()
+            for word in value:
+                if word in self.exclude_title:
+                    out.append(word)
+                else:
+                    out.append(word[0].upper() + word[1:])
+            value = " ".join(out)
+        return value
+    def verify_ordinal(self, value):
+        if not value == int(value):
+            raise TypeError(self.errmsg_floatord % value)
+        if not abs(value) == value:
+            raise TypeError(self.errmsg_negord % value)
+    def to_ordinal(self, value):
+        return self.to_cardinal(value)
+    def to_ordinal_num(self, value):
+        return value
+    # Trivial version
+    def inflect(self, value, text):
+        text = text.split("/")
+        if value == 1:
+            return text[0]
+        return "".join(text)
+    # //CHECK: generalise? Any others like pounds/shillings/pence?
+    def to_splitnum(self, val, hightxt="", lowtxt="", jointxt="",
+                    divisor=100, longval=True, cents=True):
+        out = []
+        if isinstance(val, float):
+            high, low = self.float2tuple(val)
+        else:
+            try:
+                high, low = val
+            except TypeError:
+                high, low = divmod(val, divisor)
+        if high:
+            hightxt = self.title(self.inflect(high, hightxt))
+            out.append(self.to_cardinal(high))
+            if low:
+                if longval:
+                    if hightxt:
+                        out.append(hightxt)
+                    if jointxt:
+                        out.append(self.title(jointxt))
+            elif hightxt:
+                out.append(hightxt)
+        if low:
+            if cents:
+                out.append(self.to_cardinal(low))
+            else:
+                out.append("%02d" % low)
+            if lowtxt and longval:
+                out.append(self.title(self.inflect(low, lowtxt)))
+        return " ".join(out)
+    def to_year(self, value, **kwargs):
+        return self.to_cardinal(value)
+    def pluralize(self, n, forms):
+        """
+        Should resolve gettext form:
+        http://docs.translatehouse.org/projects/localization-guide/en/latest/l10n/pluralforms.html
+        """
+        raise NotImplementedError
+    def _cents_verbose(self, number, currency):
+        return self.to_cardinal(number)
+    def _cents_terse(self, number, currency):
+        return "%02d" % number
+    def to_currency(self, val, currency='EUR', cents=True, separator=',',
+                    adjective=False):
+        """
+        Args:
+            val: Numeric value
+            currency (str): Currency code
+            cents (bool): Verbose cents
+            separator (str): Cent separator
+            adjective (bool): Prefix currency name with adjective
+        Returns:
+            str: Formatted string
+        """
+        left, right, is_negative = parse_currency_parts(val)
+        try:
+            cr1, cr2 = self.CURRENCY_FORMS[currency]
+        except KeyError:
+            raise NotImplementedError(
+                'Currency code "%s" not implemented for "%s"' %
+                (currency, self.__class__.__name__))
+        if adjective and currency in self.CURRENCY_ADJECTIVES:
+            cr1 = prefix_currency(self.CURRENCY_ADJECTIVES[currency], cr1)
+        minus_str = "%s " % self.negword if is_negative else ""
+        cents_str = self._cents_verbose(right, currency) \
+            if cents else self._cents_terse(right, currency)
+        return u'%s%s %s%s %s %s' % (
+            minus_str,
+            self.to_cardinal(left),
+            self.pluralize(left, cr1),
+            separator,
+            cents_str,
+            self.pluralize(right, cr2)
+        )
+    def setup(self):
+        pass

num2words/compat.py ADDED Viewed

	@@ -0,0 +1,29 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+# Python 2/3 compatibility: use `basestring` where that name exists,
+# otherwise (NameError) fall back to `str`.
+try:
+    strtype = basestring
+except NameError:
+    strtype = str
+def to_s(val):
+    """Return `val` as text: `unicode(val)` where `unicode` exists, otherwise `str(val)`."""
+    try:
+        return unicode(val)
+    except NameError:
+        # No `unicode` builtin (Python 3): `str` is already the text type.
+        return str(val)

num2words/currency.py ADDED Viewed

	@@ -0,0 +1,50 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+from __future__ import division
+from decimal import ROUND_HALF_UP, Decimal
+def parse_currency_parts(value, is_int_with_cents=True):
+    if isinstance(value, int):
+        if is_int_with_cents:
+            # assume cents if value is integer
+            negative = value < 0
+            value = abs(value)
+            integer, cents = divmod(value, 100)
+        else:
+            negative = value < 0
+            integer, cents = abs(value), 0
+    else:
+        value = Decimal(value)
+        value = value.quantize(
+            Decimal('.01'),
+            rounding=ROUND_HALF_UP
+        )
+        negative = value < 0
+        value = abs(value)
+        integer, fraction = divmod(value, 1)
+        integer = int(integer)
+        cents = int(fraction * 100)
+    return integer, cents, negative
+def prefix_currency(prefix, base):
+    return tuple("%s %s" % (prefix, i) for i in base)

num2words/lang_EU.py ADDED Viewed

	@@ -0,0 +1,93 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+from __future__ import unicode_literals
+from .base import Num2Word_Base
+# (singular, plural) forms reused by several entries of CURRENCY_FORMS.
+GENERIC_DOLLARS = ('dollar', 'dollars')
+GENERIC_CENTS = ('cent', 'cents')
+class Num2Word_EU(Num2Word_Base):
+    """Converter base that adds currency tables and the `-illion` / `-illiard` large-number words."""
+    # Currency code -> (forms of the main unit, forms of the sub-unit).
+    CURRENCY_FORMS = {
+        'AUD': (GENERIC_DOLLARS, GENERIC_CENTS),
+        'CAD': (GENERIC_DOLLARS, GENERIC_CENTS),
+        # replaced by EUR
+        'EEK': (('kroon', 'kroons'), ('sent', 'senti')),
+        'EUR': (('euro', 'euro'), GENERIC_CENTS),
+        'GBP': (('pound sterling', 'pounds sterling'), ('penny', 'pence')),
+        # replaced by EUR
+        'LTL': (('litas', 'litas'), GENERIC_CENTS),
+        # replaced by EUR
+        'LVL': (('lat', 'lats'), ('santim', 'santims')),
+        'USD': (GENERIC_DOLLARS, GENERIC_CENTS),
+        'RUB': (('rouble', 'roubles'), ('kopek', 'kopeks')),
+        'SEK': (('krona', 'kronor'), ('öre', 'öre')),
+        'NOK': (('krone', 'kroner'), ('øre', 'øre')),
+        'PLN': (('zloty', 'zlotys', 'zlotu'), ('grosz', 'groszy')),
+        'MXN': (('peso', 'pesos'), GENERIC_CENTS),
+        'RON': (('leu', 'lei', 'de lei'), ('ban', 'bani', 'de bani')),
+        'INR': (('rupee', 'rupees'), ('paisa', 'paise')),
+        'HUF': (('forint', 'forint'), ('fillér', 'fillér')),
+        'ISK': (('króna', 'krónur'), ('aur', 'aurar')),
+    }
+    # Currency code -> adjective that can be prefixed to the main unit's forms.
+    CURRENCY_ADJECTIVES = {
+        'AUD': 'Australian',
+        'CAD': 'Canadian',
+        'EEK': 'Estonian',
+        'USD': 'US',
+        'RUB': 'Russian',
+        'NOK': 'Norwegian',
+        'MXN': 'Mexican',
+        'RON': 'Romanian',
+        'INR': 'Indian',
+        'HUF': 'Hungarian',
+        'ISK': 'íslenskar',
+    }
+    # Suffixes appended to each prefix in `high_numwords`; a falsy suffix
+    # disables the corresponding family of words.
+    GIGA_SUFFIX = "illiard"
+    MEGA_SUFFIX = "illion"
+    def set_high_numwords(self, high):
+        """Register the large-number words in `self.cards`.
+        For each prefix in `high` (largest first) the exponent `n` steps down
+        by 6 from `3 + 6 * len(high)`; the GIGA word is stored at `10 ** n`
+        and the MEGA word at `10 ** (n - 3)`.
+        """
+        cap = 3 + 6 * len(high)
+        for word, n in zip(high, range(cap, 3, -6)):
+            if self.GIGA_SUFFIX:
+                self.cards[10 ** n] = word + self.GIGA_SUFFIX
+            if self.MEGA_SUFFIX:
+                self.cards[10 ** (n - 3)] = word + self.MEGA_SUFFIX
+    def gen_high_numwords(self, units, tens, lows):
+        """Return every unit+ten prefix combination in reversed order, followed by `lows`."""
+        out = [u + t for t in tens for u in units]
+        out.reverse()
+        return out + lows
+    def pluralize(self, n, forms):
+        """Return `forms[0]` when `n` is 1 and `forms[1]` otherwise."""
+        form = 0 if n == 1 else 1
+        return forms[form]
+    def setup(self):
+        """Build `self.high_numwords` from the Latin-derived prefix lists."""
+        lows = ["non", "oct", "sept", "sext", "quint", "quadr", "tr", "b", "m"]
+        units = ["", "un", "duo", "tre", "quattuor", "quin", "sex", "sept",
+                 "octo", "novem"]
+        tens = ["dec", "vigint", "trigint", "quadragint", "quinquagint",
+                "sexagint", "septuagint", "octogint", "nonagint"]
+        self.high_numwords = ["cent"] + self.gen_high_numwords(units, tens,
+                                                               lows)

num2words/lang_IS.py ADDED Viewed

	@@ -0,0 +1,128 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+from __future__ import division, print_function, unicode_literals
+from . import lang_EU
+# Genders
+KK = 0  # Karlkyn (male)
+KVK = 1  # Kvenkyn (female)
+HK = 2  # Hvorugkyn (neuter)
+# Numerals that have gendered forms, keyed by their masculine form.
+# NOTE(review): the three forms are presumably indexed by KK / KVK / HK - confirm against genderize().
+GENDERS = {
+    "einn": ("einn", "ein", "eitt"),
+    "tveir": ("tveir", "tvær", "tvö"),
+    "þrír": ("þrír", "þrjár", "þrjú"),
+    "fjórir": ("fjórir", "fjórar", "fjögur"),
+}
+# Nouns with an irregular plural, as (singular, plural); indexed by the form
+# number computed in Num2Word_IS.pluralize().
+PLURALS = {
+    "hundrað": ("hundrað", "hundruð"),
+}
+class Num2Word_IS(lang_EU.Num2Word_EU):
+    """Icelandic number-to-words converter (cardinal numbers only).
+    setup() installs the Icelandic word tables; merge() joins adjacent
+    (text, value) pairs, using pluralize() and genderize() so that a
+    multiplier agrees with the unit word that follows it. Ordinals, years
+    and currency are not implemented and raise NotImplementedError.
+    """
+    GIGA_SUFFIX = "illjarður"
+    MEGA_SUFFIX = "illjón"
+    def setup(self):
+        """Install the Icelandic word tables and separator words."""
+        # Prefixes for the large-number names; how they are combined with
+        # MEGA_SUFFIX / GIGA_SUFFIX is decided by the base class
+        # (NOTE(review): defined outside this file - confirm there).
+        lows = ["okt", "sept", "sext", "kvint", "kvaðr", "tr", "b", "m"]
+        self.high_numwords = self.gen_high_numwords([], [], lows)
+        self.negword = "mínus "
+        self.pointword = "komma"
+        # All words should be excluded, title case is not used in Icelandic
+        self.exclude_title = ["og", "komma", "mínus"]
+        self.mid_numwords = [(1000, "þúsund"), (100, "hundrað"),
+                             (90, "níutíu"), (80, "áttatíu"), (70, "sjötíu"),
+                             (60, "sextíu"), (50, "fimmtíu"), (40, "fjörutíu"),
+                             (30, "þrjátíu")]
+        # Listed from 20 down to 0.
+        self.low_numwords = ["tuttugu", "nítján", "átján", "sautján",
+                             "sextán", "fimmtán", "fjórtán", "þrettán",
+                             "tólf", "ellefu", "tíu", "níu", "átta",
+                             "sjö", "sex", "fimm", "fjórir", "þrír",
+                             "tveir", "einn", "núll"]
+        # Cardinal -> ordinal word; not used by this class, whose
+        # to_ordinal() is not implemented.
+        self.ords = {"einn": "fyrsti",
+                     "tveir": "annar",
+                     "þrír": "þriðji",
+                     "fjórir": "fjórði",
+                     "fimm": "fimmti",
+                     "sex": "sjötti",
+                     "sjö": "sjöundi",
+                     "átta": "áttundi",
+                     "níu": "níundi",
+                     "tíu": "tíundi",
+                     "ellefu": "ellefti",
+                     "tólf": "tólfti"}
+    def pluralize(self, n, noun):
+        """Return ``noun`` in the number (singular/plural) required by ``n``.
+        The singular is kept when ``n`` ends in 1 but not in 11; otherwise
+        the MEGA/GIGA suffix is pluralised, or the PLURALS table is used.
+        Nouns without a known plural are returned unchanged.
+        """
+        form = 0 if (n % 10 == 1 and n % 100 != 11) else 1
+        if form == 0:
+            return noun
+        elif self.GIGA_SUFFIX in noun:
+            return noun.replace(self.GIGA_SUFFIX, "illjarðar")
+        elif self.MEGA_SUFFIX in noun:
+            return noun.replace(self.MEGA_SUFFIX, "illjónir")
+        elif noun not in PLURALS:
+            return noun
+        return PLURALS[noun][form]
+    def genderize(self, adj, noun):
+        """Make the final word of ``adj`` agree in gender with ``noun``.
+        Only the last whitespace-separated word of ``adj`` is considered,
+        and only if it is one of the inflected numerals in GENDERS;
+        otherwise ``adj`` is returned unchanged (also when it is empty).
+        """
+        words = adj.split()
+        if not words or words[-1] not in GENDERS:
+            return adj
+        last = words[-1]
+        gender = KK
+        if "hund" in noun or "þús" in noun:
+            gender = HK
+        elif "illjarð" in noun:
+            gender = KK
+        elif "illjón" in noun:
+            gender = KVK
+        # Swap the trailing word only: str.replace() would also rewrite any
+        # earlier occurrence of the same text inside ``adj``.
+        head, _, tail = adj.rpartition(last)
+        return head + GENDERS[last][gender] + tail
+    def merge(self, lpair, rpair):
+        """Combine two adjacent ``(text, value)`` pairs into one.
+        Returns a new ``(text, value)`` tuple for the combined number.
+        """
+        ltext, lnum = lpair
+        rtext, rnum = rpair
+        if lnum == 1 and rnum < 100:
+            # A multiplier of one in front of a value below 100 is dropped.
+            return (rtext, rnum)
+        elif lnum < rnum:
+            # Multiplier followed by a larger unit: inflect both, multiply.
+            rtext = self.pluralize(lnum, rtext)
+            ltext = self.genderize(ltext, rtext)
+            return ("%s %s" % (ltext, rtext), lnum * rnum)
+        elif lnum > rnum and rnum in self.cards:
+            # Smaller right-hand value that is itself a known card:
+            # joined with "og" ("and") and added.
+            rtext = self.pluralize(lnum, rtext)
+            ltext = self.genderize(ltext, rtext)
+            return ("%s og %s" % (ltext, rtext), lnum + rnum)
+        # Anything else is juxtaposed and added.
+        return ("%s %s" % (ltext, rtext), lnum + rnum)
+    def to_ordinal(self, value):
+        """Not implemented for Icelandic; always raises NotImplementedError."""
+        raise NotImplementedError
+    def to_ordinal_num(self, value):
+        """Not implemented for Icelandic; always raises NotImplementedError."""
+        raise NotImplementedError
+    def to_year(self, val, suffix=None, longval=True):
+        """Not implemented for Icelandic; always raises NotImplementedError."""
+        raise NotImplementedError
+    def to_currency(self, val, longval=True):
+        """Not implemented for Icelandic; always raises NotImplementedError."""
+        raise NotImplementedError

num2words/utils.py ADDED Viewed

	@@ -0,0 +1,35 @@

+# -*- coding: utf-8 -*-
+# Copyright (c) 2003, Taro Ogawa.  All Rights Reserved.
+# Copyright (c) 2013, Savoir-faire Linux inc.  All Rights Reserved.
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+# Lesser General Public License for more details.
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
+# MA 02110-1301 USA
+def splitbyx(n, x, format_int=True):
+    """Yield consecutive chunks of ``n``, each at most ``x`` items long.
+    If ``len(n)`` is not a multiple of ``x`` the first chunk is the short
+    one, so the later chunks line up with the right-hand end of ``n``.
+    Each chunk is passed through ``int`` unless ``format_int`` is false,
+    in which case the raw slice is yielded.
+    """
+    convert = int if format_int else (lambda piece: piece)
+    total = len(n)
+    if total <= x:
+        # Nothing to split: the whole input is the only chunk.
+        yield convert(n)
+        return
+    head = total % x
+    if head > 0:
+        yield convert(n[:head])
+    for offset in range(head, total, x):
+        yield convert(n[offset:offset + x])
+def get_digits(n):
+    """Return the last three decimal digits of ``n`` as ints, lowest first.
+    ``n`` is zero-padded to three places first, e.g. 5 -> [5, 0, 0].
+    """
+    padded = '%03d' % n
+    return [int(ch) for ch in padded[-3:][::-1]]