Spaces:

skytnt
/

moe-tts

Running on CPU Upgrade

App Files Files Community

skytnt committed on Sep 21, 2022

Commit

e1ac136

1 Parent(s): 668c08a

update model

Browse files

Files changed (13) hide show

README.md +1 -1
requirements.txt +1 -0
saved_model/12/config.json +3 -0
saved_model/12/cover.jpg +3 -0
saved_model/12/model.pth +3 -0
saved_model/6/model.pth +1 -1
saved_model/info.json +2 -2
text/__init__.py +16 -16
text/cleaners.py +46 -454
text/japanese.py +132 -0
text/korean.py +205 -0
text/mandarin.py +171 -0
text/sanskrit.py +62 -0

README.md CHANGED Viewed

@@ -4,7 +4,7 @@ emoji: 😊🎙️
 colorFrom: red
 colorTo: pink
 sdk: gradio
-sdk_version: 3.3
 app_file: app.py
 pinned: false
 license: mit

 colorFrom: red
 colorTo: pink
 sdk: gradio
+sdk_version: 3.3.1
 app_file: app.py
 pinned: false
 license: mit

requirements.txt CHANGED Viewed

@@ -12,6 +12,7 @@ Unidecode
 pyopenjtalk
 jamo
 pypinyin
 jieba
 cn2an
 gradio

 pyopenjtalk
 jamo
 pypinyin
+ko_pron
 jieba
 cn2an
 gradio

saved_model/12/config.json ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:d2d4fe332cfdbe95abc1152d977acdffebe88a28db6830f6c57e1cfb47a2799d
+size 1397

saved_model/12/cover.jpg ADDED Viewed

Git LFS Details

SHA256: a4f93df7045805bcb028b92f464710e10961bae3ce43cddf2c289212673312e2
Pointer size: 130 Bytes
Size of remote file: 41 kB

saved_model/12/model.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:cf8761f1f7818c961651d2c0d914821f742a9a1df8841aae376c888289ae5609
+size 158888269

saved_model/6/model.pth CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:b545a33fe870c214e3828da9ab8e756c6c75a30a6acee74670637fbbd3a58a0d
 size 158875981

 version https://git-lfs.github.com/spec/v1
+oid sha256:5461551d900d726e24fe5551c3773c0c27419c9237882fe7d400025344499f85
 size 158875981

saved_model/info.json CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:33b7a2042589990eb609c4e87044b7d5d6d80da206c88f54b70175ce0d2a535c
-size 1616

 version https://git-lfs.github.com/spec/v1
+oid sha256:79bd8092d3351f000c280f51beee907fb0f090efc3bb4571e9ae10c6f7c3ec23
+size 1733

text/__init__.py CHANGED Viewed

@@ -3,30 +3,30 @@ from text import cleaners
 def text_to_sequence(text, symbols, cleaner_names):
-    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
     Args:
       text: string to convert to a sequence
       cleaner_names: names of the cleaner functions to run the text through
     Returns:
       List of integers corresponding to the symbols in the text
   '''
-    _symbol_to_id = {s: i for i, s in enumerate(symbols)}
-    sequence = []
-    clean_text = _clean_text(text, cleaner_names)
-    for symbol in clean_text:
-        if symbol not in _symbol_to_id.keys():
-            continue
-        symbol_id = _symbol_to_id[symbol]
-        sequence += [symbol_id]
-    return sequence
 def _clean_text(text, cleaner_names):
-    for name in cleaner_names:
-        cleaner = getattr(cleaners, name)
-        if not cleaner:
-            raise Exception('Unknown cleaner: %s' % name)
-        text = cleaner(text)
-    return text

 def text_to_sequence(text, symbols, cleaner_names):
+  '''Clean `text` and map each of its characters to that character's index in `symbols`.
+    Args:
+      text: string to convert to a sequence
+      symbols: ordered collection of symbols; a symbol's position is its ID
+      cleaner_names: names of the cleaner functions to run the text through
+    Returns:
+      List of integer IDs, one per character of the cleaned text that occurs
+      in `symbols`; characters that are not in `symbols` are dropped
+  '''
+  id_of = {s: i for i, s in enumerate(symbols)}
+  cleaned = _clean_text(text, cleaner_names)
+  return [id_of[ch] for ch in cleaned if ch in id_of]
 def _clean_text(text, cleaner_names):
+  '''Run `text` through the named functions of the `cleaners` module, in order.
+    Args:
+      text: string to clean
+      cleaner_names: iterable of attribute names looked up on `cleaners`
+    Returns:
+      The text after every cleaner has been applied
+    Raises:
+      Exception: if a name does not resolve to an attribute of `cleaners`
+  '''
+  for name in cleaner_names:
+    # The default is required: without it getattr raises AttributeError for an
+    # unknown name and the explicit "Unknown cleaner" check can never fire.
+    cleaner = getattr(cleaners, name, None)
+    if not cleaner:
+      raise Exception('Unknown cleaner: %s' % name)
+    text = cleaner(text)
+  return text

text/cleaners.py CHANGED Viewed

@@ -1,447 +1,10 @@
-""" from https://github.com/keithito/tacotron """
-'''
-Cleaners are transformations that run over the input text at both training and eval time.
-Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
-hyperparameter. Some cleaners are English-specific. You'll typically want to use:
-  1. "english_cleaners" for English text
-  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
-     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
-  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
-     the symbols in symbols.py to match your data).
-'''
-import os
 import re
-import sys
-import cn2an
-import jieba
-import pyopenjtalk
-from jamo import h2j, j2hcj
-from pypinyin import lazy_pinyin, BOPOMOFO
-from unidecode import unidecode
-jieba.initialize()
-pyopenjtalk._lazy_init()
-# This is a list of Korean classifiers preceded by pure Korean numerals.
-_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
-# Regular expression matching whitespace:
-_whitespace_re = re.compile(r'\s+')
-# Regular expression matching Japanese without punctuation marks:
-_japanese_characters = re.compile(
-    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
-# Regular expression matching non-Japanese characters or punctuation marks:
-_japanese_marks = re.compile(
-    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
-# List of (regular expression, replacement) pairs for abbreviations:
-_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('mrs', 'misess'),
-    ('mr', 'mister'),
-    ('dr', 'doctor'),
-    ('st', 'saint'),
-    ('co', 'company'),
-    ('jr', 'junior'),
-    ('maj', 'major'),
-    ('gen', 'general'),
-    ('drs', 'doctors'),
-    ('rev', 'reverend'),
-    ('lt', 'lieutenant'),
-    ('hon', 'honorable'),
-    ('sgt', 'sergeant'),
-    ('capt', 'captain'),
-    ('esq', 'esquire'),
-    ('ltd', 'limited'),
-    ('col', 'colonel'),
-    ('ft', 'fort'),
-]]
-# List of (symbol, Japanese) pairs for marks:
-_symbols_to_japanese = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('％', 'パーセント')
-]]
-# List of (hangul, hangul divided) pairs:
-_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
-    ('ㄳ', 'ㄱㅅ'),
-    ('ㄵ', 'ㄴㅈ'),
-    ('ㄶ', 'ㄴㅎ'),
-    ('ㄺ', 'ㄹㄱ'),
-    ('ㄻ', 'ㄹㅁ'),
-    ('ㄼ', 'ㄹㅂ'),
-    ('ㄽ', 'ㄹㅅ'),
-    ('ㄾ', 'ㄹㅌ'),
-    ('ㄿ', 'ㄹㅍ'),
-    ('ㅀ', 'ㄹㅎ'),
-    ('ㅄ', 'ㅂㅅ'),
-    ('ㅘ', 'ㅗㅏ'),
-    ('ㅙ', 'ㅗㅐ'),
-    ('ㅚ', 'ㅗㅣ'),
-    ('ㅝ', 'ㅜㅓ'),
-    ('ㅞ', 'ㅜㅔ'),
-    ('ㅟ', 'ㅜㅣ'),
-    ('ㅢ', 'ㅡㅣ'),
-    ('ㅑ', 'ㅣㅏ'),
-    ('ㅒ', 'ㅣㅐ'),
-    ('ㅕ', 'ㅣㅓ'),
-    ('ㅖ', 'ㅣㅔ'),
-    ('ㅛ', 'ㅣㅗ'),
-    ('ㅠ', 'ㅣㅜ')
-]]
-# List of (Latin alphabet, hangul) pairs:
-_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('a', '에이'),
-    ('b', '비'),
-    ('c', '시'),
-    ('d', '디'),
-    ('e', '이'),
-    ('f', '에프'),
-    ('g', '지'),
-    ('h', '에이치'),
-    ('i', '아이'),
-    ('j', '제이'),
-    ('k', '케이'),
-    ('l', '엘'),
-    ('m', '엠'),
-    ('n', '엔'),
-    ('o', '오'),
-    ('p', '피'),
-    ('q', '큐'),
-    ('r', '아르'),
-    ('s', '에스'),
-    ('t', '티'),
-    ('u', '유'),
-    ('v', '브이'),
-    ('w', '더블유'),
-    ('x', '엑스'),
-    ('y', '와이'),
-    ('z', '제트')
-]]
-# List of (Latin alphabet, bopomofo) pairs:
-_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('a', 'ㄟˉ'),
-    ('b', 'ㄅㄧˋ'),
-    ('c', 'ㄙㄧˉ'),
-    ('d', 'ㄉㄧˋ'),
-    ('e', 'ㄧˋ'),
-    ('f', 'ㄝˊㄈㄨˋ'),
-    ('g', 'ㄐㄧˋ'),
-    ('h', 'ㄝˇㄑㄩˋ'),
-    ('i', 'ㄞˋ'),
-    ('j', 'ㄐㄟˋ'),
-    ('k', 'ㄎㄟˋ'),
-    ('l', 'ㄝˊㄛˋ'),
-    ('m', 'ㄝˊㄇㄨˋ'),
-    ('n', 'ㄣˉ'),
-    ('o', 'ㄡˉ'),
-    ('p', 'ㄆㄧˉ'),
-    ('q', 'ㄎㄧㄡˉ'),
-    ('r', 'ㄚˋ'),
-    ('s', 'ㄝˊㄙˋ'),
-    ('t', 'ㄊㄧˋ'),
-    ('u', 'ㄧㄡˉ'),
-    ('v', 'ㄨㄧˉ'),
-    ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
-    ('x', 'ㄝˉㄎㄨˋㄙˋ'),
-    ('y', 'ㄨㄞˋ'),
-    ('z', 'ㄗㄟˋ')
-]]
-# List of (bopomofo, romaji) pairs:
-_bopomofo_to_romaji = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
-    ('ㄅㄛ', 'p⁼wo'),
-    ('ㄆㄛ', 'pʰwo'),
-    ('ㄇㄛ', 'mwo'),
-    ('ㄈㄛ', 'fwo'),
-    ('ㄅ', 'p⁼'),
-    ('ㄆ', 'pʰ'),
-    ('ㄇ', 'm'),
-    ('ㄈ', 'f'),
-    ('ㄉ', 't⁼'),
-    ('ㄊ', 'tʰ'),
-    ('ㄋ', 'n'),
-    ('ㄌ', 'l'),
-    ('ㄍ', 'k⁼'),
-    ('ㄎ', 'kʰ'),
-    ('ㄏ', 'h'),
-    ('ㄐ', 'ʧ⁼'),
-    ('ㄑ', 'ʧʰ'),
-    ('ㄒ', 'ʃ'),
-    ('ㄓ', 'ʦ`⁼'),
-    ('ㄔ', 'ʦ`ʰ'),
-    ('ㄕ', 's`'),
-    ('ㄖ', 'ɹ`'),
-    ('ㄗ', 'ʦ⁼'),
-    ('ㄘ', 'ʦʰ'),
-    ('ㄙ', 's'),
-    ('ㄚ', 'a'),
-    ('ㄛ', 'o'),
-    ('ㄜ', 'ə'),
-    ('ㄝ', 'e'),
-    ('ㄞ', 'ai'),
-    ('ㄟ', 'ei'),
-    ('ㄠ', 'au'),
-    ('ㄡ', 'ou'),
-    ('ㄧㄢ', 'yeNN'),
-    ('ㄢ', 'aNN'),
-    ('ㄧㄣ', 'iNN'),
-    ('ㄣ', 'əNN'),
-    ('ㄤ', 'aNg'),
-    ('ㄧㄥ', 'iNg'),
-    ('ㄨㄥ', 'uNg'),
-    ('ㄩㄥ', 'yuNg'),
-    ('ㄥ', 'əNg'),
-    ('ㄦ', 'əɻ'),
-    ('ㄧ', 'i'),
-    ('ㄨ', 'u'),
-    ('ㄩ', 'ɥ'),
-    ('ˉ', '→'),
-    ('ˊ', '↑'),
-    ('ˇ', '↓↑'),
-    ('ˋ', '↓'),
-    ('˙', ''),
-    ('，', ','),
-    ('。', '.'),
-    ('！', '!'),
-    ('？', '?'),
-    ('—', '-')
-]]
-def expand_abbreviations(text):
-    for regex, replacement in _abbreviations:
-        text = re.sub(regex, replacement, text)
-    return text
-def lowercase(text):
-    return text.lower()
-def collapse_whitespace(text):
-    return re.sub(_whitespace_re, ' ', text)
-def convert_to_ascii(text):
-    return unidecode(text)
-def symbols_to_japanese(text):
-    for regex, replacement in _symbols_to_japanese:
-        text = re.sub(regex, replacement, text)
-    return text
-def japanese_to_romaji_with_accent(text):
-    '''Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html'''
-    text = symbols_to_japanese(text)
-    sentences = re.split(_japanese_marks, text)
-    marks = re.findall(_japanese_marks, text)
-    text = ''
-    for i, sentence in enumerate(sentences):
-        if re.match(_japanese_characters, sentence):
-            if text != '':
-                text += ' '
-            labels = pyopenjtalk.extract_fullcontext(sentence)
-            for n, label in enumerate(labels):
-                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
-                if phoneme not in ['sil', 'pau']:
-                    text += phoneme.replace('ch', 'ʧ').replace('sh', 'ʃ').replace('cl', 'Q')
-                else:
-                    continue
-                n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
-                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
-                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
-                a3 = int(re.search(r"\+(\d+)/", label).group(1))
-                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
-                    a2_next = -1
-                else:
-                    a2_next = int(re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
-                # Accent phrase boundary
-                if a3 == 1 and a2_next == 1:
-                    text += ' '
-                # Falling
-                elif a1 == 0 and a2_next == a2 + 1 and a2 != n_moras:
-                    text += '↓'
-                # Rising
-                elif a2 == 1 and a2_next == 2:
-                    text += '↑'
-        if i < len(marks):
-            text += unidecode(marks[i]).replace(' ', '')
-    return text
-def latin_to_hangul(text):
-    for regex, replacement in _latin_to_hangul:
-        text = re.sub(regex, replacement, text)
-    return text
-def divide_hangul(text):
-    for regex, replacement in _hangul_divided:
-        text = re.sub(regex, replacement, text)
-    return text
-def hangul_number(num, sino=True):
-    '''Reference https://github.com/Kyubyong/g2pK'''
-    num = re.sub(',', '', num)
-    if num == '0':
-        return '영'
-    if not sino and num == '20':
-        return '스무'
-    digits = '123456789'
-    names = '일이삼사오육칠팔구'
-    digit2name = {d: n for d, n in zip(digits, names)}
-    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
-    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
-    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
-    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
-    spelledout = []
-    for i, digit in enumerate(num):
-        i = len(num) - i - 1
-        if sino:
-            if i == 0:
-                name = digit2name.get(digit, '')
-            elif i == 1:
-                name = digit2name.get(digit, '') + '십'
-                name = name.replace('일십', '십')
-        else:
-            if i == 0:
-                name = digit2mod.get(digit, '')
-            elif i == 1:
-                name = digit2dec.get(digit, '')
-        if digit == '0':
-            if i % 4 == 0:
-                last_three = spelledout[-min(3, len(spelledout)):]
-                if ''.join(last_three) == '':
-                    spelledout.append('')
-                    continue
-            else:
-                spelledout.append('')
-                continue
-        if i == 2:
-            name = digit2name.get(digit, '') + '백'
-            name = name.replace('일백', '백')
-        elif i == 3:
-            name = digit2name.get(digit, '') + '천'
-            name = name.replace('일천', '천')
-        elif i == 4:
-            name = digit2name.get(digit, '') + '만'
-            name = name.replace('일만', '만')
-        elif i == 5:
-            name = digit2name.get(digit, '') + '십'
-            name = name.replace('일십', '십')
-        elif i == 6:
-            name = digit2name.get(digit, '') + '백'
-            name = name.replace('일백', '백')
-        elif i == 7:
-            name = digit2name.get(digit, '') + '천'
-            name = name.replace('일천', '천')
-        elif i == 8:
-            name = digit2name.get(digit, '') + '억'
-        elif i == 9:
-            name = digit2name.get(digit, '') + '십'
-        elif i == 10:
-            name = digit2name.get(digit, '') + '백'
-        elif i == 11:
-            name = digit2name.get(digit, '') + '천'
-        elif i == 12:
-            name = digit2name.get(digit, '') + '조'
-        elif i == 13:
-            name = digit2name.get(digit, '') + '십'
-        elif i == 14:
-            name = digit2name.get(digit, '') + '백'
-        elif i == 15:
-            name = digit2name.get(digit, '') + '천'
-        spelledout.append(name)
-    return ''.join(elem for elem in spelledout)
-def number_to_hangul(text):
-    '''Reference https://github.com/Kyubyong/g2pK'''
-    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
-    for token in tokens:
-        num, classifier = token
-        if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
-            spelledout = hangul_number(num, sino=False)
-        else:
-            spelledout = hangul_number(num, sino=True)
-        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
-    # digit by digit for remaining digits
-    digits = '0123456789'
-    names = '영일이삼사오육칠팔구'
-    for d, n in zip(digits, names):
-        text = text.replace(d, n)
-    return text
-def number_to_chinese(text):
-    numbers = re.findall(r'\d+(?:\.?\d+)?', text)
-    for number in numbers:
-        text = text.replace(number, cn2an.an2cn(number), 1)
-    return text
-def chinese_to_bopomofo(text):
-    text = text.replace('、', '，').replace('；', '，').replace('：', '，')
-    words = jieba.lcut(text, cut_all=False)
-    text = ''
-    for word in words:
-        bopomofos = lazy_pinyin(word, BOPOMOFO)
-        if not re.search('[\u4e00-\u9fff]', word):
-            text += word
-            continue
-        for i in range(len(bopomofos)):
-            if re.match('[\u3105-\u3129]', bopomofos[i][-1]):
-                bopomofos[i] += 'ˉ'
-        if text != '':
-            text += ' '
-        text += ''.join(bopomofos)
-    return text
-def latin_to_bopomofo(text):
-    for regex, replacement in _latin_to_bopomofo:
-        text = re.sub(regex, replacement, text)
-    return text
-def bopomofo_to_romaji(text):
-    for regex, replacement in _bopomofo_to_romaji:
-        text = re.sub(regex, replacement, text)
-    return text
-def basic_cleaners(text):
-    '''Basic pipeline that lowercases and collapses whitespace without transliteration.'''
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
-def transliteration_cleaners(text):
-    '''Pipeline for non-English text that transliterates to ASCII.'''
-    text = convert_to_ascii(text)
-    text = lowercase(text)
-    text = collapse_whitespace(text)
-    return text
 def japanese_cleaners(text):
     text = japanese_to_romaji_with_accent(text)
-    if len(text) > 0 and re.match('[A-Za-z]', text[-1]):
         text += '.'
     return text
@@ -452,44 +15,73 @@ def japanese_cleaners2(text):
 def korean_cleaners(text):
     '''Pipeline for Korean text'''
     text = latin_to_hangul(text)
     text = number_to_hangul(text)
-    text = j2hcj(h2j(text))
     text = divide_hangul(text)
-    if len(text) > 0 and re.match('[\u3131-\u3163]', text[-1]):
         text += '.'
     return text
 def chinese_cleaners(text):
     '''Pipeline for Chinese text'''
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
-    if len(text) > 0 and re.match('[ˉˊˇˋ˙]', text[-1]):
         text += '。'
     return text
 def zh_ja_mixture_cleaners(text):
     chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
     japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
     for chinese_text in chinese_texts:
-        cleaned_text = number_to_chinese(chinese_text[4:-4])
-        cleaned_text = chinese_to_bopomofo(cleaned_text)
-        cleaned_text = latin_to_bopomofo(cleaned_text)
-        cleaned_text = bopomofo_to_romaji(cleaned_text)
-        cleaned_text = re.sub('i[aoe]', lambda x: 'y' + x.group(0)[1:], cleaned_text)
-        cleaned_text = re.sub('u[aoəe]', lambda x: 'w' + x.group(0)[1:], cleaned_text)
-        cleaned_text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑]+)', lambda x: x.group(1) + 'ɹ`' + x.group(2), cleaned_text).replace(
-            'ɻ', 'ɹ`')
-        cleaned_text = re.sub('([ʦs][⁼ʰ]?)([→↓↑]+)', lambda x: x.group(1) + 'ɹ' + x.group(2), cleaned_text)
         text = text.replace(chinese_text, cleaned_text + ' ', 1)
     for japanese_text in japanese_texts:
-        cleaned_text = japanese_to_romaji_with_accent(japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace(
-            '...', '…')
         text = text.replace(japanese_text, cleaned_text + ' ', 1)
     text = text[:-1]
-    if len(text) > 0 and re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
         text += '.'
     return text

 import re
 def japanese_cleaners(text):
+    # Pipeline for Japanese text: romanise with accent marks, then make sure
+    # the result ends with '.'.
+    # The converter is imported at call time rather than at module import.
+    from text.japanese import japanese_to_romaji_with_accent
     text = japanese_to_romaji_with_accent(text)
+    # Append '.' when the result is empty or ends with an ASCII letter.
+    if len(text) == 0 or re.match('[A-Za-z]', text[-1]):
         text += '.'
     return text
 def korean_cleaners(text):
+    '''Pipeline for Korean text.
+
+    Applies latin_to_hangul, number_to_hangul and divide_hangul from
+    text.korean, in that order, then appends '.' when the result is empty or
+    ends with a character in the range U+3131-U+3163.
+    '''
+    # Imported at call time rather than at module import.
+    from text.korean import latin_to_hangul, number_to_hangul, divide_hangul
     text = latin_to_hangul(text)
     text = number_to_hangul(text)
     text = divide_hangul(text)
+    if len(text) == 0 or re.match('[\u3131-\u3163]', text[-1]):
         text += '.'
     return text
 def chinese_cleaners(text):
+    '''Pipeline for Chinese text.
+
+    Applies number_to_chinese, chinese_to_bopomofo and latin_to_bopomofo from
+    text.mandarin, in that order, then appends '。' when the result is empty
+    or ends with one of the marks ˉ ˊ ˇ ˋ ˙.
+    '''
+    # Imported at call time rather than at module import.
+    from text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo
     text = number_to_chinese(text)
     text = chinese_to_bopomofo(text)
     text = latin_to_bopomofo(text)
+    if len(text) == 0 or re.match('[ˉˊˇˋ˙]', text[-1]):
         text += '。'
     return text
 def zh_ja_mixture_cleaners(text):
+    '''Romanise text made of [ZH]...[ZH] and [JA]...[JA] tagged segments.
+
+    Every tagged segment is replaced in place by its converted form followed
+    by one space: [ZH] segments go through chinese_to_romaji, [JA] segments
+    through japanese_to_romaji_with_accent plus a few character substitutions.
+    Text outside the tags is left as it is. A final '.' is appended when the
+    result is empty or ends with a letter or accent arrow.
+
+    Args:
+        text: input string containing tagged segments.
+    Returns:
+        The converted string.
+    '''
+    # Imported at call time rather than at module import.
+    from text.mandarin import chinese_to_romaji
+    from text.japanese import japanese_to_romaji_with_accent
+    chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
+    japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    for chinese_text in chinese_texts:
+        # [4:-4] strips the four-character opening and closing tags.
+        cleaned_text = chinese_to_romaji(chinese_text[4:-4])
+        text = text.replace(chinese_text, cleaned_text + ' ', 1)
+    for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_romaji_with_accent(
+            japanese_text[4:-4]).replace('ts', 'ʦ').replace('u', 'ɯ').replace('...', '…')
+        text = text.replace(japanese_text, cleaned_text + ' ', 1)
+    # Remove the separator space added after the last segment. Only a space is
+    # removed: an unconditional text[:-1] would delete a real character when
+    # the input has no tags or ends with untagged text such as punctuation.
+    if text.endswith(' '):
+        text = text[:-1]
+    if len(text) == 0 or re.match('[A-Za-zɯɹəɥ→↓↑]', text[-1]):
+        text += '.'
+    return text
+def sanskrit_cleaners(text):
+    '''Normalise two Devanagari signs and make sure the text ends with '।'.
+
+    '॥' is replaced by '।' and 'ॐ' by 'ओम्'. If the result is empty or does
+    not end with '।', ' ।' is appended.
+    '''
+    for old, new in (('॥', '।'), ('ॐ', 'ओम्')):
+        text = text.replace(old, new)
+    if not text.endswith('।'):
+        text += ' ।'
+    return text
+def cjks_cleaners(text):
+    '''Convert text made of [ZH], [JA], [KO] and [SA] tagged segments.
+
+    Every tagged segment is replaced in place by its converted form followed
+    by one space, using chinese_to_lazy_ipa, japanese_to_ipa,
+    korean_to_lazy_ipa and devanagari_to_ipa respectively. Text outside the
+    tags is left as it is. A final '.' is appended when the result is empty
+    or does not already end with one of . , ! ? - … ~
+
+    Args:
+        text: input string containing tagged segments.
+    Returns:
+        The converted string.
+    '''
+    # Imported at call time rather than at module import.
+    from text.mandarin import chinese_to_lazy_ipa
+    from text.japanese import japanese_to_ipa
+    from text.korean import korean_to_lazy_ipa
+    from text.sanskrit import devanagari_to_ipa
     chinese_texts = re.findall(r'\[ZH\].*?\[ZH\]', text)
     japanese_texts = re.findall(r'\[JA\].*?\[JA\]', text)
+    korean_texts = re.findall(r'\[KO\].*?\[KO\]', text)
+    sanskrit_texts = re.findall(r'\[SA\].*?\[SA\]', text)
     for chinese_text in chinese_texts:
+        # [4:-4] strips the four-character opening and closing tags.
+        cleaned_text = chinese_to_lazy_ipa(chinese_text[4:-4])
         text = text.replace(chinese_text, cleaned_text + ' ', 1)
     for japanese_text in japanese_texts:
+        cleaned_text = japanese_to_ipa(japanese_text[4:-4])
         text = text.replace(japanese_text, cleaned_text + ' ', 1)
+    for korean_text in korean_texts:
+        cleaned_text = korean_to_lazy_ipa(korean_text[4:-4])
+        text = text.replace(korean_text, cleaned_text + ' ', 1)
+    for sanskrit_text in sanskrit_texts:
+        cleaned_text = devanagari_to_ipa(sanskrit_text[4:-4])
+        text = text.replace(sanskrit_text, cleaned_text + ' ', 1)
+    # Remove the separator space added after the last segment. Only a space is
+    # removed: an unconditional text[:-1] would delete a real character when
+    # the input has no tags or ends with untagged text such as punctuation.
+    if text.endswith(' '):
+        text = text[:-1]
+    if len(text) == 0 or re.match(r'[^\.,!\?\-…~]', text[-1]):
         text += '.'
     return text

text/japanese.py ADDED Viewed

	@@ -0,0 +1,132 @@

+import re
+from unidecode import unidecode
+import pyopenjtalk
+# Regular expression matching Japanese without punctuation marks:
+_japanese_characters = re.compile(
+    r'[A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+# Regular expression matching non-Japanese characters or punctuation marks:
+_japanese_marks = re.compile(
+    r'[^A-Za-z\d\u3005\u3040-\u30ff\u4e00-\u9fff\uff11-\uff19\uff21-\uff3a\uff41-\uff5a\uff66-\uff9d]')
+# List of (symbol, Japanese) pairs for marks:
+_symbols_to_japanese = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('％', 'パーセント')
+]]
+# List of (romaji, ipa) pairs, applied in this order by japanese_to_ipa.
+# The left-hand sides are literal strings, so they are escaped before being
+# compiled: unescaped, '...' is a regex that matches ANY three characters.
+# They are also compiled case-sensitively: with re.IGNORECASE the 'u' entry
+# consumed 'U' as well and the 'I' entry rewrote every lowercase 'i', so the
+# separate upper-case entries could not work as listed.
+# NOTE(review): a model trained on text produced by the previous table may
+# expect that table's output - confirm before deploying.
+_romaji_to_ipa = [(re.compile(re.escape(x[0])), x[1]) for x in [
+    ('ts', 'ʦ'),
+    ('u', 'ɯ'),
+    ('...', '…'),
+    ('j', 'ʥ'),
+    ('y', 'j'),
+    ('ni', 'n^i'),
+    ('nj', 'n^'),
+    ('hi', 'çi'),
+    ('hj', 'ç'),
+    ('f', 'ɸ'),
+    ('I', 'i*'),
+    ('U', 'ɯ*'),
+    ('r', 'ɾ')
+]]
+# Dictionary of (consonant, sokuon) pairs:
+_real_sokuon = {
+  'k': 'k#',
+  'g': 'k#',
+  't': 't#',
+  'd': 't#',
+  'ʦ': 't#',
+  'ʧ': 't#',
+  'ʥ': 't#',
+  'j': 't#',
+  's': 's',
+  'ʃ': 's',
+  'p': 'p#',
+  'b': 'p#'
+}
+# Dictionary of (consonant, hatsuon) pairs:
+_real_hatsuon = {
+  'p': 'm',
+  'b': 'm',
+  'm': 'm',
+  't': 'n',
+  'd': 'n',
+  'n': 'n',
+  'ʧ': 'n^',
+  'ʥ': 'n^',
+  'k': 'ŋ',
+  'g': 'ŋ'
+}
+def symbols_to_japanese(text):
+    '''Replace every symbol listed in `_symbols_to_japanese` with its Japanese text.'''
+    for pattern, reading in _symbols_to_japanese:
+        text = pattern.sub(reading, text)
+    return text
+def japanese_to_romaji_with_accent(text):
+    '''Convert Japanese text to romaji annotated with accent marks.
+
+    Reference https://r9y9.github.io/ttslearn/latest/notebooks/ch10_Recipe-Tacotron.html
+
+    The text is split at every character matched by `_japanese_marks`. Each
+    remaining run is passed to pyopenjtalk.extract_fullcontext and the
+    phonemes found in its labels are concatenated, with ' ', '↓' or '↑'
+    inserted according to numeric fields read from the labels. The separating
+    characters are put back after each run, transliterated with unidecode and
+    with spaces removed.
+
+    Args:
+        text: input string.
+    Returns:
+        The romanised string.
+    '''
+    text = symbols_to_japanese(text)
+    sentences = re.split(_japanese_marks, text)
+    marks = re.findall(_japanese_marks, text)
+    # From here on `text` is reused as the output accumulator.
+    text = ''
+    for i, sentence in enumerate(sentences):
+        if re.match(_japanese_characters, sentence):
+            if text != '':
+                text += ' '
+            labels = pyopenjtalk.extract_fullcontext(sentence)
+            for n, label in enumerate(labels):
+                # The phoneme is the label field between '-' and '+'.
+                phoneme = re.search(r'\-([^\+]*)\+', label).group(1)
+                if phoneme not in ['sil', 'pau']:
+                    text += phoneme.replace('ch', 'ʧ').replace('sh',
+                                                               'ʃ').replace('cl', 'Q')
+                else:
+                    continue
+                # n_moras = int(re.search(r'/F:(\d+)_', label).group(1))
+                # a1, a2, a3: numeric fields of the '/A:' part of the label
+                # (presumably accent-related - see the reference above).
+                a1 = int(re.search(r"/A:(\-?[0-9]+)\+", label).group(1))
+                a2 = int(re.search(r"\+(\d+)\+", label).group(1))
+                a3 = int(re.search(r"\+(\d+)/", label).group(1))
+                # Look ahead one label; this relies on the label list ending
+                # with a 'sil'/'pau' entry, which is skipped above before
+                # labels[n + 1] is read - TODO confirm.
+                if re.search(r'\-([^\+]*)\+', labels[n + 1]).group(1) in ['sil', 'pau']:
+                    a2_next = -1
+                else:
+                    a2_next = int(
+                        re.search(r"\+(\d+)\+", labels[n + 1]).group(1))
+                # Accent phrase boundary
+                if a3 == 1 and a2_next == 1:
+                    text += ' '
+                # Falling
+                elif a1 == 0 and a2_next == a2 + 1:
+                    text += '↓'
+                # Rising
+                elif a2 == 1 and a2_next == 2:
+                    text += '↑'
+        # Re-insert the separator that followed this run.
+        if i < len(marks):
+            text += unidecode(marks[i]).replace(' ', '')
+    return text
+def get_real_sokuon(text):
+    '''Replace each 'Q' by the `_real_sokuon` entry for the character matched after it.
+
+    Any '↑'/'↓' between the 'Q' and that character are kept. A 'Q' whose
+    following character has no entry in `_real_sokuon` is left unchanged.
+    '''
+    def _substitute(match):
+        following = match.group(1)
+        if following not in _real_sokuon:
+            return match.group(0)
+        # group(0)[1:] is everything that was matched after the leading 'Q'.
+        return _real_sokuon[following] + match.group(0)[1:]
+    return re.sub('Q[↑↓]*(.)', _substitute, text)
+def get_real_hatsuon(text):
+    '''Replace each 'N' by the `_real_hatsuon` entry for the character matched after it.
+
+    Any '↑'/'↓' between the 'N' and that character are kept. An 'N' whose
+    following character has no entry in `_real_hatsuon` is left unchanged.
+    '''
+    def _substitute(match):
+        following = match.group(1)
+        if following not in _real_hatsuon:
+            return match.group(0)
+        # group(0)[1:] is everything that was matched after the leading 'N'.
+        return _real_hatsuon[following] + match.group(0)[1:]
+    return re.sub('N[↑↓]*(.)', _substitute, text)
+def japanese_to_ipa(text):
+    '''Convert Japanese text to the IPA-style symbols used by this module.
+
+    Steps: japanese_to_romaji_with_accent, the `_romaji_to_ipa` substitutions
+    in list order, repeated-letter lengthening, then get_real_sokuon and
+    get_real_hatsuon.
+    '''
+    def _lengthen(match):
+        # Keep the first letter of the run and add one 'ː' per repetition.
+        run = match.group(0)
+        return run[0] + 'ː' * (len(run) - 1)
+    ipa = japanese_to_romaji_with_accent(text)
+    for pattern, replacement in _romaji_to_ipa:
+        ipa = pattern.sub(replacement, ipa)
+    ipa = re.sub(r'([A-Za-zɯ])\1+', _lengthen, ipa)
+    return get_real_hatsuon(get_real_sokuon(ipa))

text/korean.py ADDED Viewed

	@@ -0,0 +1,205 @@

+import re
+from jamo import h2j, j2hcj
+import ko_pron
+# This is a list of Korean classifiers preceded by pure Korean numerals.
+_korean_classifiers = '군데 권 개 그루 닢 대 두 마리 모 모금 뭇 발 발짝 방 번 벌 보루 살 수 술 시 쌈 움큼 정 짝 채 척 첩 축 켤레 톨 통'
+# List of (hangul, hangul divided) pairs:
+_hangul_divided = [(re.compile('%s' % x[0]), x[1]) for x in [
+    ('ㄳ', 'ㄱㅅ'),
+    ('ㄵ', 'ㄴㅈ'),
+    ('ㄶ', 'ㄴㅎ'),
+    ('ㄺ', 'ㄹㄱ'),
+    ('ㄻ', 'ㄹㅁ'),
+    ('ㄼ', 'ㄹㅂ'),
+    ('ㄽ', 'ㄹㅅ'),
+    ('ㄾ', 'ㄹㅌ'),
+    ('ㄿ', 'ㄹㅍ'),
+    ('ㅀ', 'ㄹㅎ'),
+    ('ㅄ', 'ㅂㅅ'),
+    ('ㅘ', 'ㅗㅏ'),
+    ('ㅙ', 'ㅗㅐ'),
+    ('ㅚ', 'ㅗㅣ'),
+    ('ㅝ', 'ㅜㅓ'),
+    ('ㅞ', 'ㅜㅔ'),
+    ('ㅟ', 'ㅜㅣ'),
+    ('ㅢ', 'ㅡㅣ'),
+    ('ㅑ', 'ㅣㅏ'),
+    ('ㅒ', 'ㅣㅐ'),
+    ('ㅕ', 'ㅣㅓ'),
+    ('ㅖ', 'ㅣㅔ'),
+    ('ㅛ', 'ㅣㅗ'),
+    ('ㅠ', 'ㅣㅜ')
+]]
+# List of (Latin alphabet, hangul) pairs:
+_latin_to_hangul = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('a', '에이'),
+    ('b', '비'),
+    ('c', '시'),
+    ('d', '디'),
+    ('e', '이'),
+    ('f', '에프'),
+    ('g', '지'),
+    ('h', '에이치'),
+    ('i', '아이'),
+    ('j', '제이'),
+    ('k', '케이'),
+    ('l', '엘'),
+    ('m', '엠'),
+    ('n', '엔'),
+    ('o', '오'),
+    ('p', '피'),
+    ('q', '큐'),
+    ('r', '아르'),
+    ('s', '에스'),
+    ('t', '티'),
+    ('u', '유'),
+    ('v', '브이'),
+    ('w', '더블유'),
+    ('x', '엑스'),
+    ('y', '와이'),
+    ('z', '제트')
+]]
+# List of (ipa, lazy ipa) pairs:
+_ipa_to_lazy_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
+    ('t͡ɕ','ʧ'),
+    ('d͡ʑ','ʥ'),
+    ('ɲ','n^'),
+    ('ɕ','ʃ'),
+    ('ʷ','w'),
+    ('ɭ','l`'),
+    ('ʎ','ɾ'),
+    ('ɣ','ŋ'),
+    ('ɰ','ɯ'),
+    ('ʝ','j'),
+    ('ʌ','ə'),
+    ('ɡ','g'),
+    ('\u031a','#'),
+    ('\u0348','='),
+    ('\u031e',''),
+    ('\u0320',''),
+    ('\u0339','')
+]]
+def latin_to_hangul(text):
+    '''Replace every Latin letter, in either case, with the hangul listed for it in `_latin_to_hangul`.'''
+    for pattern, hangul in _latin_to_hangul:
+        text = pattern.sub(hangul, text)
+    return text
+def divide_hangul(text):
+    '''Pass `text` through jamo's h2j and j2hcj, then split the compound jamo.
+
+    Every left-hand entry of `_hangul_divided` found in the converted text is
+    replaced by the sequence listed for it.
+    '''
+    jamo_text = j2hcj(h2j(text))
+    for pattern, parts in _hangul_divided:
+        jamo_text = pattern.sub(parts, jamo_text)
+    return jamo_text
+def hangul_number(num, sino=True):
+    '''Spell out a string of digits in hangul.
+
+    Reference https://github.com/Kyubyong/g2pK
+
+    Args:
+        num: the number as a string of digits; commas are removed first.
+        sino: selects the names used for the ones and tens digits -
+            `digit2name` (plus '십' for tens) when True, `digit2mod` and
+            `digit2dec` when False. Higher positions always use `digit2name`.
+    Returns:
+        The spelled-out number as one string.
+    '''
+    num = re.sub(',', '', num)
+    # Special cases returned directly.
+    if num == '0':
+        return '영'
+    if not sino and num == '20':
+        return '스무'
+    digits = '123456789'
+    names = '일이삼사오육칠팔구'
+    digit2name = {d: n for d, n in zip(digits, names)}
+    modifiers = '한 두 세 네 다섯 여섯 일곱 여덟 아홉'
+    decimals = '열 스물 서른 마흔 쉰 예순 일흔 여든 아흔'
+    digit2mod = {d: mod for d, mod in zip(digits, modifiers.split())}
+    digit2dec = {d: dec for d, dec in zip(digits, decimals.split())}
+    spelledout = []
+    for i, digit in enumerate(num):
+        # Re-purpose i as the position counted from the right (0 = ones).
+        i = len(num) - i - 1
+        if sino:
+            if i == 0:
+                name = digit2name.get(digit, '')
+            elif i == 1:
+                name = digit2name.get(digit, '') + '십'
+                name = name.replace('일십', '십')
+        else:
+            if i == 0:
+                name = digit2mod.get(digit, '')
+            elif i == 1:
+                name = digit2dec.get(digit, '')
+        if digit == '0':
+            # A zero contributes an empty entry, except at a position that is
+            # a multiple of four when the previous (up to three) entries are
+            # not all empty: that case falls through to the branches below.
+            if i % 4 == 0:
+                last_three = spelledout[-min(3, len(spelledout)):]
+                if ''.join(last_three) == '':
+                    spelledout.append('')
+                    continue
+            else:
+                spelledout.append('')
+                continue
+        # Positions 2 and above: digit name followed by the unit for that
+        # position; a leading '일' is dropped for positions 2 to 7.
+        # NOTE(review): no branch handles i >= 16, so `name` is not assigned
+        # for those positions.
+        if i == 2:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 3:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 4:
+            name = digit2name.get(digit, '') + '만'
+            name = name.replace('일만', '만')
+        elif i == 5:
+            name = digit2name.get(digit, '') + '십'
+            name = name.replace('일십', '십')
+        elif i == 6:
+            name = digit2name.get(digit, '') + '백'
+            name = name.replace('일백', '백')
+        elif i == 7:
+            name = digit2name.get(digit, '') + '천'
+            name = name.replace('일천', '천')
+        elif i == 8:
+            name = digit2name.get(digit, '') + '억'
+        elif i == 9:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 10:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 11:
+            name = digit2name.get(digit, '') + '천'
+        elif i == 12:
+            name = digit2name.get(digit, '') + '조'
+        elif i == 13:
+            name = digit2name.get(digit, '') + '십'
+        elif i == 14:
+            name = digit2name.get(digit, '') + '백'
+        elif i == 15:
+            name = digit2name.get(digit, '') + '천'
+        spelledout.append(name)
+    return ''.join(elem for elem in spelledout)
+def number_to_hangul(text):
+    '''Spell out the digits in `text` in hangul.
+
+    Reference https://github.com/Kyubyong/g2pK
+
+    A number (digits and commas) immediately followed by characters in
+    U+AC00-U+D71F is spelled with hangul_number: with sino=False when the
+    first two characters, or the first character, of that following text
+    occur in `_korean_classifiers`, otherwise with sino=True. Any digit left
+    afterwards is replaced one by one.
+    '''
+    tokens = set(re.findall(r'(\d[\d,]*)([\uac00-\ud71f]+)', text))
+    for token in tokens:
+        num, classifier = token
+        # NOTE(review): `_korean_classifiers` is one space-separated string, so
+        # `in` here is a substring test, not a whole-word lookup.
+        if classifier[:2] in _korean_classifiers or classifier[0] in _korean_classifiers:
+            spelledout = hangul_number(num, sino=False)
+        else:
+            spelledout = hangul_number(num, sino=True)
+        text = text.replace(f'{num}{classifier}', f'{spelledout}{classifier}')
+    # digit by digit for remaining digits
+    digits = '0123456789'
+    names = '영일이삼사오육칠팔구'
+    for d, n in zip(digits, names):
+        text = text.replace(d, n)
+    return text
+def korean_to_lazy_ipa(text):
+    '''Convert Korean text to the simplified IPA symbols used by this module.
+
+    Steps: latin_to_hangul, number_to_hangul, ko_pron.romanise(..., 'ipa') on
+    every run of characters in U+AC00-U+D7AF, then the `_ipa_to_lazy_ipa`
+    substitutions in list order.
+
+    Args:
+        text: input string.
+    Returns:
+        The converted string.
+    '''
+    text = latin_to_hangul(text)
+    text = number_to_hangul(text)
+    # ko_pron presumably joins alternative pronunciations with '] ~ ['; only
+    # the first one is kept. The split must happen per hangul run, inside the
+    # callback: splitting the whole substituted text would throw away
+    # everything after the first word that has more than one variant.
+    text = re.sub('[\uac00-\ud7af]+',
+                  lambda x: ko_pron.romanise(x.group(0), 'ipa').split('] ~ [')[0],
+                  text)
+    for regex, replacement in _ipa_to_lazy_ipa:
+        text = re.sub(regex, replacement, text)
+    return text

text/mandarin.py ADDED Viewed

	@@ -0,0 +1,171 @@

+import os
+import sys
+import re
+from pypinyin import lazy_pinyin, BOPOMOFO
+import jieba
+import cn2an
+import logging
+# Only let warnings and above from the 'jieba' logger through.
+logging.getLogger('jieba').setLevel(logging.WARNING)
+# Initialise jieba at import time (presumably so the dictionary load does not
+# happen during the first segmentation call -- confirm against jieba's docs).
+jieba.initialize()
+# (compiled case-insensitive pattern, bopomofo) pairs, one per Latin letter.
+# latin_to_bopomofo applies them in this order.
+_latin_to_bopomofo = [
+    (re.compile(letter, re.IGNORECASE), reading)
+    for letter, reading in (
+        ('a', 'ㄟˉ'),
+        ('b', 'ㄅㄧˋ'),
+        ('c', 'ㄙㄧˉ'),
+        ('d', 'ㄉㄧˋ'),
+        ('e', 'ㄧˋ'),
+        ('f', 'ㄝˊㄈㄨˋ'),
+        ('g', 'ㄐㄧˋ'),
+        ('h', 'ㄝˇㄑㄩˋ'),
+        ('i', 'ㄞˋ'),
+        ('j', 'ㄐㄟˋ'),
+        ('k', 'ㄎㄟˋ'),
+        ('l', 'ㄝˊㄛˋ'),
+        ('m', 'ㄝˊㄇㄨˋ'),
+        ('n', 'ㄣˉ'),
+        ('o', 'ㄡˉ'),
+        ('p', 'ㄆㄧˉ'),
+        ('q', 'ㄎㄧㄡˉ'),
+        ('r', 'ㄚˋ'),
+        ('s', 'ㄝˊㄙˋ'),
+        ('t', 'ㄊㄧˋ'),
+        ('u', 'ㄧㄡˉ'),
+        ('v', 'ㄨㄧˉ'),
+        ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
+        ('x', 'ㄝˉㄎㄨˋㄙˋ'),
+        ('y', 'ㄨㄞˋ'),
+        ('z', 'ㄗㄟˋ'),
+    )
+]
+# (compiled pattern, romaji) pairs for bopomofo letters, tone marks and
+# full-width punctuation.  The rules are applied one after another, so the
+# two-letter patterns are listed before the single letters they contain.
+_bopomofo_to_romaji = [
+    (re.compile(symbol), romaji)
+    for symbol, romaji in (
+        ('ㄅㄛ', 'p⁼wo'),
+        ('ㄆㄛ', 'pʰwo'),
+        ('ㄇㄛ', 'mwo'),
+        ('ㄈㄛ', 'fwo'),
+        ('ㄅ', 'p⁼'),
+        ('ㄆ', 'pʰ'),
+        ('ㄇ', 'm'),
+        ('ㄈ', 'f'),
+        ('ㄉ', 't⁼'),
+        ('ㄊ', 'tʰ'),
+        ('ㄋ', 'n'),
+        ('ㄌ', 'l'),
+        ('ㄍ', 'k⁼'),
+        ('ㄎ', 'kʰ'),
+        ('ㄏ', 'h'),
+        ('ㄐ', 'ʧ⁼'),
+        ('ㄑ', 'ʧʰ'),
+        ('ㄒ', 'ʃ'),
+        ('ㄓ', 'ʦ`⁼'),
+        ('ㄔ', 'ʦ`ʰ'),
+        ('ㄕ', 's`'),
+        ('ㄖ', 'ɹ`'),
+        ('ㄗ', 'ʦ⁼'),
+        ('ㄘ', 'ʦʰ'),
+        ('ㄙ', 's'),
+        ('ㄚ', 'a'),
+        ('ㄛ', 'o'),
+        ('ㄜ', 'ə'),
+        ('ㄝ', 'e'),
+        ('ㄞ', 'ai'),
+        ('ㄟ', 'ei'),
+        ('ㄠ', 'au'),
+        ('ㄡ', 'ou'),
+        ('ㄧㄢ', 'yeNN'),
+        ('ㄢ', 'aNN'),
+        ('ㄧㄣ', 'iNN'),
+        ('ㄣ', 'əNN'),
+        ('ㄤ', 'aNg'),
+        ('ㄧㄥ', 'iNg'),
+        ('ㄨㄥ', 'uNg'),
+        ('ㄩㄥ', 'yuNg'),
+        ('ㄥ', 'əNg'),
+        ('ㄦ', 'əɻ'),
+        ('ㄧ', 'i'),
+        ('ㄨ', 'u'),
+        ('ㄩ', 'ɥ'),
+        ('ˉ', '→'),
+        ('ˊ', '↑'),
+        ('ˇ', '↓↑'),
+        ('ˋ', '↓'),
+        ('˙', ''),
+        ('，', ','),
+        ('。', '.'),
+        ('！', '!'),
+        ('？', '?'),
+        ('—', '-'),
+    )
+]
+# (compiled case-insensitive pattern, ipa) pairs applied, in order, to the
+# romaji produced by chinese_to_romaji.  The '...y' rules come before the
+# bare 'y' rule so that they see the 'y' first.
+_romaji_to_ipa = [
+    (re.compile(romaji, re.IGNORECASE), ipa)
+    for romaji, ipa in (
+        ('ʃy', 'ʃ'),
+        ('ʧʰy', 'ʧʰ'),
+        ('ʧ⁼y', 'ʧ⁼'),
+        ('NN', 'n'),
+        ('Ng', 'ŋ'),
+        ('y', 'j'),
+        ('h', 'x'),
+    )
+]
+def number_to_chinese(text):
+    '''Replace each number in ``text`` with what ``cn2an.an2cn`` returns for it.
+
+    A number is a run of digits, optionally followed by more digits with at
+    most one '.' in between.
+    '''
+    return re.sub(r'\d+(?:\.?\d+)?',
+                  lambda found: cn2an.an2cn(found.group(0)),
+                  text)
+def chinese_to_bopomofo(text):
+    '''Convert the Chinese words in ``text`` to bopomofo with explicit tones.
+
+    The enumeration comma, semicolon and colon ('、', '；', '：') are first
+    turned into '，'.  The text is then segmented with jieba.  A segment
+    without any character in U+4E00-U+9FFF is copied through unchanged; any
+    other segment is converted with ``lazy_pinyin(..., BOPOMOFO)`` and
+    separated from the preceding output by a single space.
+
+    Args:
+        text: Input string.
+
+    Returns:
+        The converted string.
+    '''
+    text = text.replace('、', '，').replace('；', '，').replace('：', '，')
+    result = ''
+    for word in jieba.lcut(text, cut_all=False):
+        if not re.search('[\u4e00-\u9fff]', word):
+            result += word
+            continue
+        # Only convert segments that are actually used; the conversion result
+        # for non-Chinese segments was previously computed and discarded.
+        # A syllable that ends in a bopomofo letter carries no tone mark, so
+        # 'ˉ' is appended to make the tone explicit.
+        syllables = [
+            syllable + 'ˉ'
+            if re.match('[\u3105-\u3129]', syllable[-1]) else syllable
+            for syllable in lazy_pinyin(word, BOPOMOFO)
+        ]
+        if result != '':
+            result += ' '
+        result += ''.join(syllables)
+    return result
+def latin_to_bopomofo(text):
+    '''Apply every ``_latin_to_bopomofo`` rule to ``text``, in table order.'''
+    for pattern, reading in _latin_to_bopomofo:
+        text = pattern.sub(reading, text)
+    return text
+def bopomofo_to_romaji(text):
+    '''Apply every ``_bopomofo_to_romaji`` rule to ``text``, in table order.'''
+    for pattern, romaji in _bopomofo_to_romaji:
+        text = pattern.sub(romaji, text)
+    return text
+def chinese_to_romaji(text):
+    '''Convert Chinese text (with numbers and Latin letters) to romaji.
+
+    Runs number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo and
+    bopomofo_to_romaji in that order and then patches up the result.
+    '''
+    romaji = bopomofo_to_romaji(
+        latin_to_bopomofo(chinese_to_bopomofo(number_to_chinese(text))))
+    # 'i' / 'u' directly before another vowel become the glides 'y' / 'w'.
+    romaji = re.sub('i[aoe]', lambda m: 'y' + m.group(0)[1:], romaji)
+    romaji = re.sub('u[aoəe]', lambda m: 'w' + m.group(0)[1:], romaji)
+    # A backtick-marked initial followed straight away by tone marks, spaces
+    # or the end of the text has no vowel: insert 'ɹ`' after it.
+    romaji = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
+                    lambda m: m.group(1) + 'ɹ`' + m.group(2), romaji)
+    romaji = romaji.replace('ɻ', 'ɹ`')
+    # Same for the unmarked 'ʦ' / 's' initials, which get a plain 'ɹ'.
+    romaji = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)',
+                    lambda m: m.group(1) + 'ɹ' + m.group(2), romaji)
+    return romaji
+def chinese_to_lazy_ipa(text):
+    '''Convert Chinese text to romaji, then apply the ``_romaji_to_ipa`` rules in order.'''
+    ipa = chinese_to_romaji(text)
+    for pattern, substitute in _romaji_to_ipa:
+        ipa = pattern.sub(substitute, ipa)
+    return ipa

text/sanskrit.py ADDED Viewed

	@@ -0,0 +1,62 @@

+import re
+from indic_transliteration import sanscript
+# (compiled pattern, ipa) pairs for IAST text.  devanagari_to_ipa applies
+# them in this order, which the aspirate rules rely on: e.g. 'k' is first
+# rewritten to 'k⁼', and only then does 'k⁼h' become 'kʰ'.
+_iast_to_ipa = [
+    (re.compile(iast), ipa)
+    for iast, ipa in (
+        ('a', 'ə'),
+        ('ā', 'aː'),
+        ('ī', 'iː'),
+        ('ū', 'uː'),
+        ('ṛ', 'ɹ`'),
+        ('ṝ', 'ɹ`ː'),
+        ('ḷ', 'l`'),
+        ('ḹ', 'l`ː'),
+        ('e', 'eː'),
+        ('o', 'oː'),
+        ('k', 'k⁼'),
+        ('k⁼h', 'kʰ'),
+        ('g', 'g⁼'),
+        ('g⁼h', 'gʰ'),
+        ('ṅ', 'ŋ'),
+        ('c', 'ʧ⁼'),
+        ('ʧ⁼h', 'ʧʰ'),
+        ('j', 'ʥ⁼'),
+        ('ʥ⁼h', 'ʥʰ'),
+        ('ñ', 'n^'),
+        ('ṭ', 't`⁼'),
+        ('t`⁼h', 't`ʰ'),
+        ('ḍ', 'd`⁼'),
+        ('d`⁼h', 'd`ʰ'),
+        ('ṇ', 'n`'),
+        ('t', 't⁼'),
+        ('t⁼h', 'tʰ'),
+        ('d', 'd⁼'),
+        ('d⁼h', 'dʰ'),
+        ('p', 'p⁼'),
+        ('p⁼h', 'pʰ'),
+        ('b', 'b⁼'),
+        ('b⁼h', 'bʰ'),
+        ('y', 'j'),
+        ('ś', 'ʃ'),
+        ('ṣ', 's`'),
+        ('r', 'ɾ'),
+        ('l̤', 'l`'),
+        ('h', 'ɦ'),
+        ("'", ''),
+        ('~', '^'),
+        ('ṃ', '^'),
+    )
+]
+def devanagari_to_ipa(text):
+    '''Convert Devanagari text to IPA.
+
+    'ॐ' is spelled out, dandas are turned into ASCII punctuation, the text
+    is transliterated to IAST and the ``_iast_to_ipa`` rules are applied in
+    order.  Finally every 'ḥ' is replaced by 'h', a copy of the character
+    captured before it (any '`' / 'ː' in between are skipped) and '*'.
+    '''
+    text = text.replace('ॐ', 'ओम्')
+    # Order matters: a danda that closes the text becomes a full stop before
+    # the remaining dandas are turned into commas.
+    for pattern, mark in ((r'\s*।\s*$', '.'),
+                          (r'\s*।\s*', ', '),
+                          (r'\s*॥', '.')):
+        text = re.sub(pattern, mark, text)
+    ipa = sanscript.transliterate(text, sanscript.DEVANAGARI, sanscript.IAST)
+    for pattern, replacement in _iast_to_ipa:
+        ipa = pattern.sub(replacement, ipa)
+
+    def _expand_visarga(found):
+        return found.group(0)[:-1] + 'h' + found.group(1) + '*'
+
+    return re.sub('(.)[`ː]*ḥ', _expand_visarga, ipa)