{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'イエーイ'" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "formal_to_informal = {\n", " \n", " \n", "\n", " 'ワタクシ': 'わたし',\n", " 'チカコ':'しゅうこ',\n", " \"タノヒト\":\"ほかのひと\",\n", "\n", " # Add more mappings as needed\n", "}\n", "\n", "formal_to_informal2 = {\n", "\n", " \"たのひと\":\"ほかのひと\",\n", " \"すうは\": \"かずは\",\n", "\n", "\n", " # Add more mappings as needed\n", "}\n", "\n", "formal_to_informal3 = {\n", "\n", " \"%\":\"%\",\n", " \"@\": \"あっとさいん\",\n", " \"$\":\"どる\",\n", " \"#\":\"はっしゅたぐ\",\n", " \"$\":\"どる\",\n", " \"#\":\"はっしゅたぐ\",\n", " \"何が\":\"なにが\",\n", "\n", " \"何も\":\"なにも\",\n", " \"何か\":\"なにか\",\n", " # \"奏\":\"かなで\",\n", " \"何は\":\"なにが\",\n", " \"お父様\":\"おとうさま\",\n", " \"お兄様\":\"おにいさま\",\n", " \"何を\":\"なにを\",\n", " \"良い\":\"いい\",\n", " \"李衣菜\":\"りいな\",\n", " \"志希\":\"しき\",\n", " \"種\":\"たね\",\n", " \"方々\":\"かたがた\",\n", " \"颯\":\"はやて\",\n", " \"茄子さん\":\"かこさん\",\n", " \"茄子ちゃん\":\"かこちゃん\",\n", " \"涼ちゃん\":\"りょうちゃん\",\n", " \"涼さん\":\"りょうさん\",\n", " \"紗枝\":\"さえ\",\n", " \"文香\":\"ふみか\",\n", " \"私\":\"わたし\",\n", " \"周子\":\"しゅうこ\",\n", " \"イェ\":\"いえ\",\n", " \"可憐\":\"かれん\",\n", " \"加蓮\":\"かれん\",\n", " \"・\":\".\",\n", " \"方の\":\"かたの\",\n", " \"気に\":\"きに\",\n", " \"唯さん\":\"ゆいさん\",\n", " \"唯ちゃん\":\"ゆいちゃん\",\n", " \"聖ちゃん\":\"ひじりちゃん\",\n", " \"他の\":\"ほかの\",\n", " \"他に\":\"ほかに\",\n", " \"一生懸命\":\"いっしょうけんめい\",\n", " \"楓さん\":\"かえでさん\",\n", " \"楓ちゃん\":\"かえでちゃん\",\n", " \"内から\":\"ないから\",\n", " \"の下で\":\"のしたで\",\n", "\n", "}\n", "\n", "\n", "mapper = dict([\n", "\n", " (\"仕方\",\"しかた\"),\n", " (\"明日\",\"あした\"),\n", " ('私',\"わたし\"),\n", " (\"従妹\",\"いとこ\"),\n", " \n", " (\"1人\",\"ひとり\"),\n", " (\"2人\",\"ふたり\"),\n", " \n", " (\"一期\",\"いちご\"),\n", " (\"一会\",\"いちえ\"),\n", " \n", " (\"♪\",\"!\"),\n", " (\"?\",\"?\"),\n", "\n", " (\"どんな方\",\"どんなかた\"),\n", " (\"ふたり暮らし\",\"ふたりぐらし\"),\n", "\n", " (\"新年\",\"しんねん\"),\n", " (\"来年\",\"らいねん\"),\n", " (\"去年\",\"きょねん\"),\n", " (\"壮年\",\"そうねん\"),\n", " (\"今年\",\"ことし\"),\n", "\n", " (\"昨年\",\"さくねん\"),\n", " (\"本年\",\"ほんねん\"),\n", " (\"平年\",\"へいねん\"),\n", " (\"閏年\",\"うるうどし\"),\n", " (\"初年\",\"しょねん\"),\n", " (\"少年\",\"しょうねん\"),\n", " (\"多年\",\"たねん\"),\n", " (\"青年\",\"せいねん\"),\n", " (\"中年\",\"ちゅうねん\"),\n", " (\"老年\",\"ろうねん\"),\n", " (\"成年\",\"せいねん\"),\n", " (\"幼年\",\"ようねん\"),\n", " (\"前年\",\"ぜんねん\"),\n", " (\"元年\",\"がんねん\"),\n", " (\"経年\",\"けいねん\"),\n", " (\"当年\",\"とうねん\"),\n", "\n", " (\"明年\",\"みょうねん\"),\n", " (\"歳年\",\"さいねん\"),\n", " (\"数年\",\"すうねん\"),\n", " (\"半年\",\"はんとし\"),\n", " (\"後年\",\"こうねん\"),\n", " (\"実年\",\"じつねん\"),\n", " (\"年年\",\"ねんねん\"),\n", " (\"連年\",\"れんねん\"),\n", " (\"暦年\",\"れきねん\"),\n", " (\"各年\",\"かくねん\"),\n", " (\"全年\",\"ぜんねん\"),\n", "\n", " (\"年を\",\"としを\"),\n", " (\"年が\",\"としが\"),\n", " (\"年も\",\"としも\"),\n", " (\"年は\",\"としは\"),\n", "\n", "\n", " (\"奏ちゃん\",\"かなでちゃん\"),\n", " (\"負けず嫌い\",\"まけずぎらい\"),\n", " (\"貴方\",\"あなた\"),\n", " (\"貴女\",\"あなた\"),\n", " (\"貴男\",\"あなた\"),\n", "\n", " (\"その節\",\"そのせつ\"),\n", "\n", " (\"何し\",\"なにし\"),\n", " (\"何する\",\"なにする\"),\n", "\n", " (\"心さん\",\"しんさん\"),\n", " (\"心ちゃん\",\"しんちゃん\"),\n", "\n", " (\"乃々\",\"のの\"),\n", "\n", " (\"身体の\",\"からだの\"),\n", " (\"身体が\",\"からだが\"),\n", " (\"身体を\",\"からだを\"),\n", " (\"身体は\",\"からだは\"),\n", " (\"身体に\",\"からだに\"),\n", " (\"正念場\",\"しょうねんば\"),\n", " (\"言う\",\"いう\"),\n", " \n", " \n", " (\"一回\",\"いっかい\"),\n", " (\"一曲\",\"いっきょく\"),\n", " (\"一日\",\"いちにち\"),\n", " (\"一言\",\"ひとこと\"),\n", " (\"一杯\",\"いっぱい\"),\n", " \n", " \n", " 
(\"方が\",\"ほうが\"),\n", " (\"縦輪城\",\"じゅうりんしろ\"),\n", " (\"深息\",\"しんそく\"),\n", " (\"家人\",\"かじん\"),\n", " (\"お返し\",\"おかえし\"),\n", " (\"化物語\",\"ばけものがたり\"),\n", " (\"阿良々木暦\",\"あららぎこよみ\"),\n", " (\"何より\",\"なにより\")\n", "\n", "\n", "])\n", "\n", "\n", "# Merge all dictionaries into one\n", "all_transformations = {**formal_to_informal, **formal_to_informal2, **formal_to_informal3, **mapper}\n", "\n", "def apply_transformations(text, transformations = all_transformations):\n", " for key, value in transformations.items():\n", " text = text.replace(key, value)\n", " return text\n", "apply_transformations('イエーイ')\n" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'さん人'" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "\n", "def number_to_japanese(num):\n", " if not isinstance(num, int) or num < 0 or num > 9999:\n", " return \"Invalid input\"\n", "\n", " digits = [\"\", \"いち\", \"に\", \"さん\", \"よん\", \"ご\", \"ろく\", \"なな\", \"はち\", \"きゅう\"]\n", " tens = [\"\", \"じゅう\", \"にじゅう\", \"さんじゅう\", \"よんじゅう\", \"ごじゅう\", \"ろくじゅう\", \"ななじゅう\", \"はちじゅう\", \"きゅうじゅう\"]\n", " hundreds = [\"\", \"ひゃく\", \"にひゃく\", \"さんびゃく\", \"よんひゃく\", \"ごひゃく\", \"ろっぴゃく\", \"ななひゃく\", \"はっぴゃく\", \"きゅうひゃく\"]\n", " thousands = [\"\", \"せん\", \"にせん\", \"さんぜん\", \"よんせん\", \"ごせん\", \"ろくせん\", \"ななせん\", \"はっせん\", \"きゅうせん\"]\n", "\n", " if num == 0:\n", " return \"ゼロ\"\n", "\n", " result = \"\"\n", " if num >= 1000:\n", " result += thousands[num // 1000]\n", " num %= 1000\n", " if num >= 100:\n", " result += hundreds[num // 100]\n", " num %= 100\n", " if num >= 10:\n", " result += tens[num // 10]\n", " num %= 10\n", " if num > 0:\n", " result += digits[num]\n", "\n", " return result\n", "\n", "def convert_numbers_in_string(input_string):\n", " # Regular expression to find numbers in the string\n", " number_pattern = re.compile(r'\\d+')\n", "\n", " # Function to replace numbers with their Japanese pronunciation\n", " def replace_with_japanese(match):\n", " num = int(match.group())\n", " return number_to_japanese(num)\n", "\n", " # Replace all occurrences of numbers in the string\n", " converted_string = number_pattern.sub(replace_with_japanese, input_string)\n", " return converted_string\n", "\n", "convert_numbers_in_string(\"3人\")\n" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "\n", "roma_mapper = dict([\n", " \n", " ################################\n", "\n", " (\"my\",\"mʲ\"),\n", " (\"by\",\"bʲ\"),\n", " (\"ny\",\"nʲ\"),\n", " (\"ry\",\"rʲ\"),\n", " (\"si\",\"sʲ\"),\n", " (\"ky\",\"kʲ\"),\n", " (\"gy\",\"gʲ\"),\n", " (\"dy\",\"dʲ\"),\n", " (\"di\",\"dʲ\"),\n", " (\"fi\",\"fʲ\"),\n", " (\"fy\",\"fʲ\"),\n", " (\"ch\",\"tɕ\"),\n", " (\"sh\",\"ɕ\"),\n", " \n", " ################################\n", "\n", " (\"a\",\"a\"),\n", " (\"i\",\"i\"),\n", " (\"u\",\"ɯ\"),\n", " (\"e\",\"e\"),\n", " (\"o\",\"o\"),\n", " (\"ka\",\"ka\"),\n", " (\"ki\",\"ki\"),\n", " (\"ku\",\"kɯ\"),\n", " (\"ke\",\"ke\"),\n", " (\"ko\",\"ko\"),\n", " (\"sa\",\"sa\"),\n", " (\"shi\",\"ɕi\"),\n", " (\"su\",\"sɯ\"),\n", " (\"se\",\"se\"),\n", " (\"so\",\"so\"),\n", " (\"ta\",\"ta\"),\n", " (\"chi\",\"tɕi\"),\n", " (\"tsu\",\"tsɯ\"),\n", " (\"te\",\"te\"),\n", " (\"to\",\"to\"),\n", " (\"na\",\"na\"),\n", " (\"ni\",\"ni\"),\n", " (\"nu\",\"nɯ\"),\n", " (\"ne\",\"ne\"),\n", " (\"no\",\"no\"),\n", " (\"ha\",\"ha\"),\n", " (\"hi\",\"çi\"),\n", " (\"fu\",\"ɸɯ\"),\n", " (\"he\",\"he\"),\n", " (\"ho\",\"ho\"),\n", " 
(\"ma\",\"ma\"),\n", " (\"mi\",\"mi\"),\n", " (\"mu\",\"mɯ\"),\n", " (\"me\",\"me\"),\n", " (\"mo\",\"mo\"),\n", " (\"ra\",\"ɽa\"),\n", " (\"ri\",\"ɽi\"),\n", " (\"ru\",\"ɽɯ\"),\n", " (\"re\",\"ɽe\"),\n", " (\"ro\",\"ɽo\"),\n", " (\"ga\",\"ga\"),\n", " (\"gi\",\"gi\"),\n", " (\"gu\",\"gɯ\"),\n", " (\"ge\",\"ge\"),\n", " (\"go\",\"go\"),\n", " (\"za\",\"za\"),\n", " (\"ji\",\"dʑi\"),\n", " (\"zu\",\"zɯ\"),\n", " (\"ze\",\"ze\"),\n", " (\"zo\",\"zo\"),\n", " (\"da\",\"da\"),\n", " \n", "\n", " (\"zu\",\"zɯ\"),\n", " (\"de\",\"de\"),\n", " (\"do\",\"do\"),\n", " (\"ba\",\"ba\"),\n", " (\"bi\",\"bi\"),\n", " (\"bu\",\"bɯ\"),\n", " (\"be\",\"be\"),\n", " (\"bo\",\"bo\"),\n", " (\"pa\",\"pa\"),\n", " (\"pi\",\"pi\"),\n", " (\"pu\",\"pɯ\"),\n", " (\"pe\",\"pe\"),\n", " (\"po\",\"po\"),\n", " (\"ya\",\"ja\"),\n", " (\"yu\",\"jɯ\"),\n", " (\"yo\",\"jo\"),\n", " (\"wa\",\"wa\"),\n", " \n", "\n", " \n", "\n", " (\"a\",\"a\"),\n", " (\"i\",\"i\"),\n", " (\"u\",\"ɯ\"),\n", " (\"e\",\"e\"),\n", " (\"o\",\"o\"),\n", " (\"wa\",\"wa\"),\n", " (\"o\",\"o\"),\n", "\n", "\n", " (\"wo\",\"o\")])\n", "\n", "nasal_sound = dict([\n", " # before m, p, b\n", " (\"ɴm\",\"mm\"),\n", " (\"ɴb\", \"mb\"),\n", " (\"ɴp\", \"mp\"),\n", " \n", " # before k, g\n", " (\"ɴk\",\"ŋk\"),\n", " (\"ɴg\", \"ŋg\"),\n", " \n", " # before t, d, n, s, z, ɽ\n", " (\"ɴt\",\"nt\"),\n", " (\"ɴd\", \"nd\"),\n", " (\"ɴn\",\"nn\"),\n", " (\"ɴs\", \"ns\"),\n", " (\"ɴz\",\"nz\"),\n", " (\"ɴɽ\", \"nɽ\"),\n", " \n", " (\"ɴɲ\", \"ɲɲ\"),\n", " \n", "])\n", "\n", "def Roma2IPA(text):\n", " orig = text\n", "\n", " for k, v in roma_mapper.items():\n", " text = text.replace(k, v)\n", " \n", " return text\n", "\n", "def nasal_mapper(text):\n", " orig = text\n", "\n", "\n", " for k, v in nasal_sound.items():\n", " text = text.replace(k, v)\n", " \n", " return text\n", "\n", "def alphabetreading(text):\n", " alphabet_dict = {\"A\": \"エイ\",\n", " \"B\": \"ビー\",\n", " \"C\": \"シー\",\n", " \"D\": \"ディー\",\n", " \"E\": \"イー\",\n", " \"F\": \"エフ\",\n", " \"G\": \"ジー\",\n", " \"H\": \"エイチ\",\n", " \"I\":\"アイ\",\n", " \"J\":\"ジェイ\",\n", " \"K\":\"ケイ\",\n", " \"L\":\"エル\",\n", " \"M\":\"エム\",\n", " \"N\":\"エヌ\",\n", " \"O\":\"オー\",\n", " \"P\":\"ピー\",\n", " \"Q\":\"キュー\",\n", " \"R\":\"アール\",\n", " \"S\":\"エス\",\n", " \"T\":\"ティー\",\n", " \"U\":\"ユー\",\n", " \"V\":\"ヴィー\",\n", " \"W\":\"ダブリュー\",\n", " \"X\":\"エックス\",\n", " \"Y\":\"ワイ\",\n", " \"Z\":\"ゼッド\"}\n", " text = text.upper()\n", " text_ret = \"\"\n", " for t in text:\n", " if t in alphabet_dict:\n", " text_ret += alphabet_dict[t]\n", " else:\n", " text_ret += t\n", " return text_ret" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'sekawɴdo'" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import re\n", "import cutlet\n", "\n", "roma_mapper_plus_2 = {\n", " \n", "\"bjo\":'bʲo',\n", "\"rjo\":\"rʲo\",\n", "\"kjo\":\"kʲo\",\n", "\"kyu\":\"kʲu\",\n", " \n", "}\n", "\n", "def replace_repeated_chars(input_string):\n", " result = []\n", " i = 0\n", " while i < len(input_string):\n", " if i + 1 < len(input_string) and input_string[i] == input_string[i + 1] and input_string[i] in 'aiueo':\n", " result.append(input_string[i] + 'ː')\n", " i += 2 \n", " else:\n", " result.append(input_string[i])\n", " i += 1\n", " return ''.join(result)\n", "\n", "\n", "def replace_chars_2(text, mapping=roma_mapper_plus_2):\n", " \n", "\n", " sorted_keys = sorted(mapping.keys(), key=len, reverse=True)\n", "\n", " pattern = 
'|'.join(re.escape(key) for key in sorted_keys) # alternation of all keys; sorted longest-first above so longer sequences match first\n", " \n", "\n", " def replace(match):\n", " key = match.group(0)\n", " return mapping.get(key, key)\n", "\n", " return re.sub(pattern, replace, text)\n", "\n", "\n", "def replace_tashdid_2(s):\n", " vowels = 'aiueoɯ0123456789.?!_。؟?!...@@##$$%%^^&&**()()_+=[「」]>\\`~~―ー∺\"'\n", " result = []\n", " \n", " i = 0\n", " while i < len(s):\n", " if i < len(s) - 2 and s[i].lower() == s[i + 2].lower() and s[i].lower() not in vowels and s[i + 1] == ' ':\n", " result.append('ʔ')\n", " result.append(s[i + 2])\n", " i += 3\n", " elif i < len(s) - 1 and s[i].lower() == s[i + 1].lower() and s[i].lower() not in vowels:\n", " result.append('ʔ')\n", " result.append(s[i + 1])\n", " i += 2\n", " else:\n", " result.append(s[i])\n", " i += 1\n", " \n", " return ''.join(result)\n", "\n", "def replace_tashdid(input_string):\n", " result = []\n", " i = 0\n", " while i < len(input_string):\n", " if i + 1 < len(input_string) and input_string[i] == input_string[i + 1] and input_string[i] not in 'aiueo':\n", " result.append('ʔ')\n", " result.append(input_string[i])\n", " i += 2 # Skip the next character as it is already processed\n", " else:\n", " result.append(input_string[i])\n", " i += 1\n", " return ''.join(result)\n", "\n", "def hira2ipa(text, roma_mapper=roma_mapper):\n", " keys_set = set(roma_mapper.keys())\n", " special_rule = (\"n\", \"ɴ\")\n", " \n", " transformed_text = []\n", " i = 0\n", "\n", " while i < len(text):\n", " if text[i] == special_rule[0]: \n", " if i + 1 == len(text) or text[i + 1] not in keys_set: \n", " transformed_text.append(special_rule[1]) \n", " else:\n", " transformed_text.append(text[i]) \n", " else:\n", " transformed_text.append(text[i]) \n", " \n", " i += 1 \n", "\n", " return ''.join(transformed_text)\n", "\n", "\n", "hira2ipa(\"sekawndo\")" ] }, { "cell_type": "code", "execution_count": 5, "metadata": { "jupyter": { "source_hidden": true } }, "outputs": [], "source": [ "import re\n", "\n", "\n", "k_mapper = dict([\n", " (\"ゔぁ\",\"ba\"),\n", " (\"ゔぃ\",\"bi\"),\n", " (\"ゔぇ\",\"be\"),\n", " (\"ゔぉ\",\"bo\"),\n", " (\"ゔゃ\",\"bʲa\"),\n", " (\"ゔゅ\",\"bʲɯ\"),\n", " (\"ゔょ\",\"bʲo\"),\n", "\n", " (\"ゔ\",\"bɯ\"),\n", "\n", " (\"あぁ\",\" aː\"),\n", " (\"いぃ\",\" iː\"),\n", " (\"いぇ\",\" je\"),\n", " (\"いゃ\",\" ja\"),\n", " (\"うぅ\",\" ɯː\"),\n", " (\"えぇ\",\" eː\"),\n", " (\"おぉ\",\" oː\"),\n", " (\"かぁ\",\" kaː\"),\n", " (\"きぃ\",\" kiː\"),\n", " (\"くぅ\",\"kɯː\"),\n", " (\"くゃ\",\"kʲa\"),\n", " (\"くゅ\",\"kʲɯ\"),\n", " (\"くょ\",\"kʲo\"),\n", " (\"けぇ\",\"keː\"),\n", " (\"こぉ\",\"koː\"),\n", " (\"がぁ\",\"gaː\"),\n", " (\"ぎぃ\",\"giː\"),\n", " (\"ぐぅ\",\"gɯː\"),\n", " (\"ぐゃ\",\"gʲa\"),\n", " (\"ぐゅ\",\"gʲɯ\"),\n", " (\"ぐょ\",\"gʲo\"),\n", " (\"げぇ\",\"geː\"),\n", " (\"ごぉ\",\"goː\"),\n", " (\"さぁ\",\"saː\"),\n", " (\"しぃ\",\"ɕiː\"),\n", " (\"すぅ\",\"sɯː\"),\n", " (\"すゃ\",\"sʲa\"),\n", " (\"すゅ\",\"sʲɯ\"),\n", " (\"すょ\",\"sʲo\"),\n", " (\"せぇ\",\"seː\"),\n", " (\"そぉ\",\"soː\"),\n", " (\"ざぁ\",\"zaː\"),\n", " (\"じぃ\",\"dʑiː\"),\n", " (\"ずぅ\",\"zɯː\"),\n", " (\"ずゃ\",\"zʲa\"),\n", " (\"ずゅ\",\"zʲɯ\"),\n", " (\"ずょ\",\"zʲo\"),\n", " (\"ぜぇ\",\"zeː\"),\n", " (\"ぞぉ\",\"zoː\"),\n", " (\"たぁ\",\"taː\"),\n", " (\"ちぃ\",\"tɕiː\"),\n", " (\"つぁ\",\"tsa\"),\n", " (\"つぃ\",\"tsi\"),\n", " (\"つぅ\",\"tsɯː\"),\n", " (\"つゃ\",\"tɕa\"),\n", " (\"つゅ\",\"tɕɯ\"),\n", " (\"つょ\",\"tɕo\"),\n", " (\"つぇ\",\"tse\"),\n", " (\"つぉ\",\"tso\"),\n", " (\"てぇ\",\"teː\"),\n", " (\"とぉ\",\"toː\"),\n", " (\"だぁ\",\"daː\"),\n", " (\"ぢぃ\",\"dʑiː\"),\n", " (\"づぅ\",\"dɯː\"),\n", " 
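# NB: post_fix() also replaces in insertion order, so these two-kana clusters must come before the single-kana fallbacks further down\n", "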
(\"づゃ\",\"zʲa\"),\n", " (\"づゅ\",\"zʲɯ\"),\n", " (\"づょ\",\"zʲo\"),\n", " (\"でぇ\",\"deː\"),\n", " (\"どぉ\",\"doː\"),\n", " (\"なぁ\",\"naː\"),\n", " (\"にぃ\",\"niː\"),\n", " (\"ぬぅ\",\"nɯː\"),\n", " (\"ぬゃ\",\"nʲa\"),\n", " (\"ぬゅ\",\"nʲɯ\"),\n", " (\"ぬょ\",\"nʲo\"),\n", " (\"ねぇ\",\"neː\"),\n", " (\"のぉ\",\"noː\"),\n", " (\"はぁ\",\"haː\"),\n", " (\"ひぃ\",\"çiː\"),\n", " (\"ふぅ\",\"ɸɯː\"),\n", " (\"ふゃ\",\"ɸʲa\"),\n", " (\"ふゅ\",\"ɸʲɯ\"),\n", " (\"ふょ\",\"ɸʲo\"),\n", " (\"へぇ\",\"heː\"),\n", " (\"ほぉ\",\"hoː\"),\n", " (\"ばぁ\",\"baː\"),\n", " (\"びぃ\",\"biː\"),\n", " (\"ぶぅ\",\"bɯː\"),\n", " (\"ふゃ\",\"ɸʲa\"),\n", " (\"ぶゅ\",\"bʲɯ\"),\n", " (\"ふょ\",\"ɸʲo\"),\n", " (\"べぇ\",\"beː\"),\n", " (\"ぼぉ\",\"boː\"),\n", " (\"ぱぁ\",\"paː\"),\n", " (\"ぴぃ\",\"piː\"),\n", " (\"ぷぅ\",\"pɯː\"),\n", " (\"ぷゃ\",\"pʲa\"),\n", " (\"ぷゅ\",\"pʲɯ\"),\n", " (\"ぷょ\",\"pʲo\"),\n", " (\"ぺぇ\",\"peː\"),\n", " (\"ぽぉ\",\"poː\"),\n", " (\"まぁ\",\"maː\"),\n", " (\"みぃ\",\"miː\"),\n", " (\"むぅ\",\"mɯː\"),\n", " (\"むゃ\",\"mʲa\"),\n", " (\"むゅ\",\"mʲɯ\"),\n", " (\"むょ\",\"mʲo\"),\n", " (\"めぇ\",\"meː\"),\n", " (\"もぉ\",\"moː\"),\n", " (\"やぁ\",\"jaː\"),\n", " (\"ゆぅ\",\"jɯː\"),\n", " (\"ゆゃ\",\"jaː\"),\n", " (\"ゆゅ\",\"jɯː\"),\n", " (\"ゆょ\",\"joː\"),\n", " (\"よぉ\",\"joː\"),\n", " (\"らぁ\",\"ɽaː\"),\n", " (\"りぃ\",\"ɽiː\"),\n", " (\"るぅ\",\"ɽɯː\"),\n", " (\"るゃ\",\"ɽʲa\"),\n", " (\"るゅ\",\"ɽʲɯ\"),\n", " (\"るょ\",\"ɽʲo\"),\n", " (\"れぇ\",\"ɽeː\"),\n", " (\"ろぉ\",\"ɽoː\"),\n", " (\"わぁ\",\"ɯaː\"),\n", " (\"をぉ\",\"oː\"),\n", "\n", " (\"う゛\",\"bɯ\"),\n", " (\"でぃ\",\"di\"),\n", " (\"でぇ\",\"deː\"),\n", " (\"でゃ\",\"dʲa\"),\n", " (\"でゅ\",\"dʲɯ\"),\n", " (\"でょ\",\"dʲo\"),\n", " (\"てぃ\",\"ti\"),\n", " (\"てぇ\",\"teː\"),\n", " (\"てゃ\",\"tʲa\"),\n", " (\"てゅ\",\"tʲɯ\"),\n", " (\"てょ\",\"tʲo\"),\n", " (\"すぃ\",\"si\"),\n", " (\"ずぁ\",\"zɯa\"),\n", " (\"ずぃ\",\"zi\"),\n", " (\"ずぅ\",\"zɯ\"),\n", " (\"ずゃ\",\"zʲa\"),\n", " (\"ずゅ\",\"zʲɯ\"),\n", " (\"ずょ\",\"zʲo\"),\n", " (\"ずぇ\",\"ze\"),\n", " (\"ずぉ\",\"zo\"),\n", " (\"きゃ\",\"kʲa\"),\n", " (\"きゅ\",\"kʲɯ\"),\n", " (\"きょ\",\"kʲo\"),\n", " (\"しゃ\",\"ɕʲa\"),\n", " (\"しゅ\",\"ɕʲɯ\"),\n", " (\"しぇ\",\"ɕʲe\"),\n", " (\"しょ\",\"ɕʲo\"),\n", " (\"ちゃ\",\"tɕa\"),\n", " (\"ちゅ\",\"tɕɯ\"),\n", " (\"ちぇ\",\"tɕe\"),\n", " (\"ちょ\",\"tɕo\"),\n", " (\"とぅ\",\"tɯ\"),\n", " (\"とゃ\",\"tʲa\"),\n", " (\"とゅ\",\"tʲɯ\"),\n", " (\"とょ\",\"tʲo\"),\n", " (\"どぁ\",\"doa\"),\n", " (\"どぅ\",\"dɯ\"),\n", " (\"どゃ\",\"dʲa\"),\n", " (\"どゅ\",\"dʲɯ\"),\n", " (\"どょ\",\"dʲo\"),\n", " (\"どぉ\",\"doː\"),\n", " (\"にゃ\",\"nʲa\"),\n", " (\"にゅ\",\"nʲɯ\"),\n", " (\"にょ\",\"nʲo\"),\n", " (\"ひゃ\",\"çʲa\"),\n", " (\"ひゅ\",\"çʲɯ\"),\n", " (\"ひょ\",\"çʲo\"),\n", " (\"みゃ\",\"mʲa\"),\n", " (\"みゅ\",\"mʲɯ\"),\n", " (\"みょ\",\"mʲo\"),\n", " (\"りゃ\",\"ɽʲa\"),\n", " (\"りぇ\",\"ɽʲe\"),\n", " (\"りゅ\",\"ɽʲɯ\"),\n", " (\"りょ\",\"ɽʲo\"),\n", " (\"ぎゃ\",\"gʲa\"),\n", " (\"ぎゅ\",\"gʲɯ\"),\n", " (\"ぎょ\",\"gʲo\"),\n", " (\"ぢぇ\",\"dʑe\"),\n", " (\"ぢゃ\",\"dʑa\"),\n", " (\"ぢゅ\",\"dʑɯ\"),\n", " (\"ぢょ\",\"dʑo\"),\n", " (\"じぇ\",\"dʑe\"),\n", " (\"じゃ\",\"dʑa\"),\n", " (\"じゅ\",\"dʑɯ\"),\n", " (\"じょ\",\"dʑo\"),\n", " (\"びゃ\",\"bʲa\"),\n", " (\"びゅ\",\"bʲɯ\"),\n", " (\"びょ\",\"bʲo\"),\n", " (\"ぴゃ\",\"pʲa\"),\n", " (\"ぴゅ\",\"pʲɯ\"),\n", " (\"ぴょ\",\"pʲo\"),\n", " (\"うぁ\",\"ɯa\"),\n", " (\"うぃ\",\"ɯi\"),\n", " (\"うぇ\",\"ɯe\"),\n", " (\"うぉ\",\"ɯo\"),\n", " (\"うゃ\",\"ɯʲa\"),\n", " (\"うゅ\",\"ɯʲɯ\"),\n", " (\"うょ\",\"ɯʲo\"),\n", " (\"ふぁ\",\"ɸa\"),\n", " (\"ふぃ\",\"ɸi\"),\n", " (\"ふぅ\",\"ɸɯ\"),\n", " (\"ふゃ\",\"ɸʲa\"),\n", " (\"ふゅ\",\"ɸʲɯ\"),\n", " (\"ふょ\",\"ɸʲo\"),\n", " (\"ふぇ\",\"ɸe\"),\n", " (\"ふぉ\",\"ɸo\"),\n", "\n", " (\"あ\",\" a\"),\n", " (\"い\",\" 
i\"),\n", " (\"う\",\"ɯ\"),\n", " (\"え\",\" e\"),\n", " (\"お\",\" o\"),\n", " (\"か\",\" ka\"),\n", " (\"き\",\" ki\"),\n", " (\"く\",\" kɯ\"),\n", " (\"け\",\" ke\"),\n", " (\"こ\",\" ko\"),\n", " (\"さ\",\" sa\"),\n", " (\"し\",\" ɕi\"),\n", " (\"す\",\" sɯ\"),\n", " (\"せ\",\" se\"),\n", " (\"そ\",\" so\"),\n", " (\"た\",\" ta\"),\n", " (\"ち\",\" tɕi\"),\n", " (\"つ\",\" tsɯ\"),\n", " (\"て\",\" te\"),\n", " (\"と\",\" to\"),\n", " (\"な\",\" na\"),\n", " (\"に\",\" ni\"),\n", " (\"ぬ\",\" nɯ\"),\n", " (\"ね\",\" ne\"),\n", " (\"の\",\" no\"),\n", " (\"は\",\" ha\"),\n", " (\"ひ\",\" çi\"),\n", " (\"ふ\",\" ɸɯ\"),\n", " (\"へ\",\" he\"),\n", " (\"ほ\",\" ho\"),\n", " (\"ま\",\" ma\"),\n", " (\"み\",\" mi\"),\n", " (\"む\",\" mɯ\"),\n", " (\"め\",\" me\"),\n", " (\"も\",\" mo\"),\n", " (\"ら\",\" ɽa\"),\n", " (\"り\",\" ɽi\"),\n", " (\"る\",\" ɽɯ\"),\n", " (\"れ\",\" ɽe\"),\n", " (\"ろ\",\" ɽo\"),\n", " (\"が\",\" ga\"),\n", " (\"ぎ\",\" gi\"),\n", " (\"ぐ\",\" gɯ\"),\n", " (\"げ\",\" ge\"),\n", " (\"ご\",\" go\"),\n", " (\"ざ\",\" za\"),\n", " (\"じ\",\" dʑi\"),\n", " (\"ず\",\" zɯ\"),\n", " (\"ぜ\",\" ze\"),\n", " (\"ぞ\",\" zo\"),\n", " (\"だ\",\" da\"),\n", " (\"ぢ\",\" dʑi\"),\n", " (\"づ\",\" zɯ\"),\n", " (\"で\",\" de\"),\n", " (\"ど\",\" do\"),\n", " (\"ば\",\" ba\"),\n", " (\"び\",\" bi\"),\n", " (\"ぶ\",\" bɯ\"),\n", " (\"べ\",\" be\"),\n", " (\"ぼ\",\" bo\"),\n", " (\"ぱ\",\" pa\"),\n", " (\"ぴ\",\" pi\"),\n", " (\"ぷ\",\" pɯ\"),\n", " (\"ぺ\",\" pe\"),\n", " (\"ぽ\",\" po\"),\n", " (\"や\",\" ja\"),\n", " (\"ゆ\",\" jɯ\"),\n", " (\"よ\",\" jo\"),\n", " (\"わ\",\" wa\"),\n", " (\"ゐ\",\" i\"),\n", " (\"ゑ\",\" e\"),\n", " (\"ん\",\" ɴ\"),\n", " (\"っ\",\" ʔ\"),\n", " (\"ー\",\" ː\"),\n", "\n", " (\"ぁ\",\" a\"),\n", " (\"ぃ\",\" i\"),\n", " (\"ぅ\",\" ɯ\"),\n", " (\"ぇ\",\" e\"),\n", " (\"ぉ\",\" o\"),\n", " (\"ゎ\",\" ɯa\"),\n", " (\"ぉ\",\" o\"),\n", " (\"っ\",\"?\"),\n", "\n", " (\"を\",\"o\")\n", "\n", "])\n", "\n", "\n", "def post_fix(text):\n", " orig = text\n", "\n", " for k, v in k_mapper.items():\n", " text = text.replace(k, v)\n", "\n", " return text\n", "\n", "\n", "\n", "\n", "sym_ws = dict([\n", " \n", " (\"$ \",\"dorɯ\"),\n", " (\"$ \",\"dorɯ\"),\n", " \n", " (\"〇 \",\"marɯ\"),\n", " (\"¥ \",\"eɴ\"),\n", "\n", " (\"# \",\"haʔɕɯ tagɯ\"),\n", " (\"# \",\"haʔɕɯ tagɯ\"),\n", " \n", " (\"& \",\"ando\"),\n", " (\"& \",\"ando\"),\n", " \n", " (\"% \",\"paːsento\"),\n", " (\"% \",\"paːsento\"),\n", " \n", " (\"@ \",\"aʔto saiɴ\"),\n", " (\"@ \",\"aʔto saiɴ\")\n", " \n", "\n", " \n", "])\n", "\n", "def random_sym_fix(text): # with space\n", " orig = text\n", "\n", " for k, v in sym_ws.items():\n", " text = text.replace(k, f\" {v} \")\n", " \n", " return text\n", "\n", "\n", "sym_ns = dict([\n", " \n", " (\"$\",\"dorɯ\"),\n", " (\"$\",\"dorɯ\"),\n", " \n", " (\"〇\",\"marɯ\"),\n", " (\"¥\",\"eɴ\"),\n", "\n", " (\"#\",\"haʔɕɯ tagɯ\"),\n", " (\"#\",\"haʔɕɯ tagɯ\"),\n", " \n", " (\"&\",\"ando\"),\n", " (\"&\",\"ando\"),\n", " \n", " (\"%\",\"paːsento\"),\n", " (\"%\",\"paːsento\"),\n", " \n", " (\"@\",\"aʔto saiɴ\"),\n", " (\"@\",\"aʔto saiɴ\"),\n", " \n", " (\"~\",\"—\"),\n", " (\"kʲɯɯdʑɯɯkʲɯɯ.kʲɯɯdʑɯɯ\",\"kʲɯɯdʑɯɯ kʲɯɯ teɴ kʲɯɯdʑɯɯ\")\n", " \n", "\n", " \n", "\n", " \n", "])\n", "\n", "def random_sym_fix_no_space(text):\n", " orig = text\n", "\n", " for k, v in sym_ns.items():\n", " text = text.replace(k, f\" {v} \")\n", " \n", " return text\n", "\n", "\n", "spaces = dict([\n", " \n", " (\"ɯ ɴ\",\"ɯɴ\"),\n", " (\"na ɴ \",\"naɴ \"),\n", " (\" mina \", \" miɴna \"),\n", " (\"ko ɴ ni tɕi ha\",\"konnitɕiwa\"),\n", " (\"ha i\",\"hai\"),\n", " 
(\"boɯtɕama\",\"boʔtɕama\"),\n", " (\"i eːi\",\"ieːi\"),\n", " (\"taiɕɯtsɯdʑoɯ\",\"taiɕitsɯdʑoɯ\"),\n", " (\"soɴna ka ze ni\",\"soɴna fɯɯ ni\"),\n", " (\" i e \",\"ke \"),\n", " (\"�\",\"\"),\n", " (\"×\",\" batsɯ \"),\n", " (\"se ka ɯndo\",\"sekaɯndo\"),\n", " (\"i i\",\"iː\"),\n", " (\"i tɕi\",\"itɕi\"),\n", " (\"ka i\",\"kai\"),\n", " (\"naɴ ga\",\"nani ga\"),\n", " (\"i eː i\",\"ieːi\"),\n", " \n", " (\"naɴ koɽe\",\"nani koɽe\"),\n", " (\"naɴ soɽe\",\"nani soɽe\"),\n", " (\" ɕeɴ \",\" seɴ \"),\n", " \n", " # (\"konna\",\"koɴna\"),\n", " # (\"sonna\",\" soɴna \"),\n", " # (\"anna\",\"aɴna\"),\n", " # (\"nn\",\"ɴn\"),\n", " \n", " (\"en \",\"eɴ \"),\n", " (\"in \",\"iɴ \"),\n", " (\"an \",\"aɴ \"),\n", " (\"on \",\"oɴ \"),\n", " (\"ɯn \",\"ɯɴ \"),\n", " # (\"nd\",\"ɴd\"),\n", " \n", " (\"koɴd o\",\"kondo\"),\n", " (\"ko ɴ d o\",\"kondo\"),\n", " (\"ko ɴ do\",\"kondo\"),\n", " \n", " (\"oanitɕaɴ\",\"oniːtɕaɴ\"),\n", " (\"oanisaɴ\",\"oniːsaɴ\"),\n", " (\"oanisama\",\"oniːsama\"),\n", " (\"hoːmɯrɯɴɯ\",\"hoːmɯrɯːmɯ\"),\n", " (\"so ɴ na \",\"sonna\"),\n", " (\" sonna \",\" sonna \"),\n", " (\" konna \",\" konna \"),\n", " (\"ko ɴ na \",\"konna\"),\n", " (\" ko to \",\" koto \"),\n", " (\"edʑdʑi\",\"eʔtɕi\"),\n", " (\" edʑdʑ \",\" eʔtɕi \"),\n", " (\" dʑdʑ \",\" dʑiːdʑiː \"),\n", " (\"secɯnd\",\"sekaɯndo\"),\n", " \n", " (\"ɴɯ\",\"nɯ\"),\n", " (\"ɴe\",\"ne\"),\n", " (\"ɴo\",\"no\"),\n", " (\"ɴa\",\"na\"),\n", " (\"ɴi\",\"ni\"),\n", " (\"ɴʲ\",\"nʲ\"),\n", " \n", " (\"hotond o\",\"hotondo\"),\n", " (\"hakoɴd e\",\"hakoɴde\"),\n", " (\"gakɯtɕi ɽi\",\"gaʔtɕiɽi \"),\n", " \n", " (\" ʔ\",\"ʔ\"),\n", " (\"ʔ \",\"ʔ\"),\n", " \n", " (\"-\",\"ː\"),\n", " (\"- \",\"ː\"),\n", " (\"--\",\"~ː\"),\n", " (\"~\",\"—\"),\n", " (\"、\",\",\"),\n", " \n", " (\" ː\",\"ː\"),\n", " ('ka nade',\"kanade\"),\n", "\n", " (\"ohahasaɴ\",\"okaːsaɴ\"),\n", " (\" \",\" \"),\n", " (\"viː\",\"bɯiː\"),\n", " (\"ːː\",\"ː—\"),\n", " \n", " (\"d ʑ\",\"dʑ\"),\n", " (\"d a\",\"da\"),\n", " (\"d e\",\"de\"),\n", " (\"d o\",\"do\"),\n", " (\"d ɯ\",\"dɯ\"),\n", " \n", " (\"niːɕiki\",\"ni iɕiki\"),\n", " (\"anitɕaɴ\",\"niːtɕaɴ\"),\n", " (\"daiːtɕi\",\"dai itɕi\"),\n", " (\"niːta\",\"ni ita\"),\n", " (\"niːrɯ\",\"ni irɯ\"),\n", " (\"a—\",\"aː\"),\n", " \n", " (\"naɴ sono\",\"nani sono\"),\n", " (\"naɴ kono\",\"nani kono\"),\n", " (\"naɴ ano\",\"nani ano\"), # Cutlet please fix your shit\n", " (\" niːtaɽa\",\" ni itaɽa\"),\n", " (\"doɽamaɕiːd\",\"doɽama ɕiːdʲi\"),\n", " (\"aɴ ta\",\"anta\"),\n", " (\"aɴta\",\"anta\"),\n", " (\"naniːʔteɴ\",\"nani iʔteɴ\"),\n", " (\"niːkite\",\"ni ikite\")\n", "\n", "])\n", "\n", "\n", "\n", "def random_space_fix(text):\n", " orig = text\n", "\n", " for k, v in spaces.items():\n", " text = text.replace(k, v)\n", " \n", " return text" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "niʔpoɴ ni wa, ɯtsɯkɯɕiː ɕizeɴ to jɯtaka na bɯŋka gaːɽimasɯ. tokɯni, ɕiki no heŋka wa niʔpoɴ no mirʲokɯ no çitotsɯ desɯ. harɯ ni wa sakɯɽa no hana ga saki, oːkɯ no çitobito ga hanami ni dekakemasɯ. sakɯɽa no ɕita de kazokɯ ja jɯɯdʑiɴ to iʔɕo ni obento o tabe taɽi, osake o noɴ daɽi ɕite tanoɕimɯ no ga iʔpanteki desɯ. natsɯ ni wa, kakɯtɕi de hanabi taikai ga okonawaɽemasɯ. jozoɽa ni ɯtɕiageɽaɽerɯ hanabi wa totemo ɯtsɯkɯɕikɯ, oːkɯ no çitobito ga jɯkata o kite saŋka ɕimasɯ. mata, natsɯ wa ɯmi ja pɯɯrɯ de ojogɯ no mo tanoɕiː desɯ. ɯmibe de wa, sandoiʔtɕi jaːisɯ kɯɽiːmɯ o tabenagaɽa, nami to tawamɯɽerɯ koto mo dekimasɯ. 
aki ni wa, koɯjoɯ ga ɯtsɯkɯɕikɯ, oːkɯ no çitobito ga momidʑigaɽi ni dekakemasɯ. tokɯni kʲoto no aɽaɕijama ja naɽa no naɽa koɯeɴ wa, koɯjoɯ ga jɯɯmei desɯ. koɯjoɯ no ɕita o sampo ɕinagaɽa, ɕizeɴ no ɯtsɯkɯɕisa o kandʑirɯ koto ga dekimasɯ. mata, aki wa oiɕiː kɯdamono ja jasai ga takɯsaɴ toɽerɯ kisetsɯ de mo aɽimasɯ. ɽiŋgo ja satsɯma imo, soɕite kɯɽi nado, aki no mikakɯ o tanoɕimɯ koto ga dekimasɯ. fɯjɯ ni wa, jɯki ga fɯɽi, tokɯni hoʔkaidoɯ ja toɯhokɯ tɕihoɯ de wa jɯkimatsɯɽi ga kaisai saɽemasɯ. setsɯzoɯ ja koːɽi no tɕoɯkokɯ ga tendʑi saɽe, jorɯ ni wa ɽaito aʔpɯ saɽete totemo gensoɯteki desɯ. mata, onseɴ ni hairɯ no mo fɯjɯ no tanoɕimi no çitotsɯ desɯ. atatakai ojɯ ni tsɯkaɽinagaɽa, soto no jɯki keɕiki o nagamerɯ no wa saikoɯ no zeitakɯ desɯ. niʔpoɴ ni wa, kono joɯ ni ɕikigoto nitɕigaʔta mirʲokɯ gaːɽimasɯ. dono kisetsɯ ni otozɯɽete mo, ataɽaɕiː haʔkeɴ ja kandoɯ gaːrɯ koto deɕoɯ. niʔpoɴ no ɕizeɴ to bɯŋka o taikeɴ ɕite, kokoɽo jɯtaka na tabi o tanoɕinde kɯdasai.\n" ] } ], "source": [ "katsu = cutlet.Cutlet(ensure_ascii=False)\n", "katsu.use_foreign_spelling = False\n", "\n", "def process_japanese_text(ml):\n", " # Check for small characters and replace them\n", " if any(char in ml for char in \"ぁぃぅぇぉ\"):\n", " \n", " ml = ml.replace(\"ぁ\", \"あ\")\n", " ml = ml.replace(\"ぃ\", \"い\")\n", " ml = ml.replace(\"ぅ\", \"う\")\n", " ml = ml.replace(\"ぇ\", \"え\")\n", " ml = ml.replace(\"ぉ\", \"お\")\n", "\n", " # Initialize Cutlet for romaji conversion\n", "\n", " # Convert to romaji and apply transformations\n", " # output = katsu.romaji(ml, capitalize=False).lower()\n", "\n", " output = katsu.romaji(apply_transformations(alphabetreading(ml)), capitalize=False).lower()\n", " \n", "\n", " # Replace specific romaji sequences\n", " if 'j' in output:\n", " output = output.replace('j', \"dʑ\")\n", " if 'tt' in output:\n", " output = output.replace('tt', \"ʔt\")\n", " if 't t' in output:\n", " output = output.replace('t t', \"ʔt\")\n", " if ' ʔt' in output:\n", " output = output.replace(' ʔt', \"ʔt\")\n", " if 'ssh' in output:\n", " output = output.replace('ssh', \"ɕɕ\")\n", "\n", " # Convert romaji to IPA\n", " output = Roma2IPA(convert_numbers_in_string(output))\n", "\n", " \n", " output = hira2ipa(output)\n", "\n", " # Apply additional transformations\n", " output = replace_chars_2(output)\n", " output = replace_repeated_chars(replace_tashdid_2(output))\n", " output = nasal_mapper(output)\n", "\n", " # Final adjustments\n", " if \" ɴ\" in output:\n", " output = output.replace(\" ɴ\", \"ɴ\")\n", " \n", " if ' neɽitai ' in output:\n", " output = output.replace(' neɽitai ', \"naɽitai\")\n", "\n", " if 'harɯdʑisama' in output:\n", " output = output.replace('harɯdʑisama', \"arɯdʑisama\")\n", "\n", "\n", " if \"ki ni ɕinai\" in output:\n", " output = re.sub(r'(?= 3:\n", "# return pattern + \"~~~\"\n", "# return match.group(0)\n", "\n", "# # Pattern for space-separated repeats\n", "# pattern1 = r'((?:\\S+\\s+){1,5}?)(?:\\1){2,}'\n", "# # Pattern for continuous repeats without spaces\n", "# pattern2 = r'(.+?)\\1{2,}'\n", "\n", "# text = re.sub(pattern1, replace_repeats, text)\n", "# text = re.sub(pattern2, replace_repeats, text)\n", "# return text\n", "\n", "\n", "def replace_repeating_a(output):\n", " # Define patterns and their replacements\n", " patterns = [\n", " (r'(aː)\\s*\\1+\\s*', r'\\1~'), # Replace repeating \"aː\" with \"aː~~\"\n", " (r'(aːa)\\s*aː', r'\\1~'), # Replace \"aːa aː\" with \"aː~~\"\n", " (r'aːa', r'aː~'), # Replace \"aːa\" with \"aː~\"\n", " (r'naː\\s*aː', r'naː~'), # Replace \"naː aː\" with 
\"naː~\"\n", " (r'(oː)\\s*\\1+\\s*', r'\\1~'), # Replace repeating \"oː\" with \"oː~~\"\n", " (r'(oːo)\\s*oː', r'\\1~'), # Replace \"oːo oː\" with \"oː~~\"\n", " (r'oːo', r'oː~'), # Replace \"oːo\" with \"oː~\"\n", " (r'(eː)\\s*\\1+\\s*', r'\\1~'), \n", " (r'(e)\\s*\\1+\\s*', r'\\1~'), \n", " (r'(eːe)\\s*eː', r'\\1~'), \n", " (r'eːe', r'eː~'), \n", " (r'neː\\s*eː', r'neː~'), \n", " ]\n", "\n", " \n", " # Apply each pattern to the output\n", " for pattern, replacement in patterns:\n", " output = re.sub(pattern, replacement, output)\n", " \n", " return output\n", "\n", "def phonemize(text):\n", " \n", " # if \"っ\" in text:\n", " # text = text.replace(\"っ\",\"ʔ\")\n", " \n", " output = post_fix(process_japanese_text(text))\n", " #output = text\n", " \n", " if \" ɴ\" in output:\n", " output = output.replace(\" ɴ\", \"ɴ\")\n", " if \"y\" in output:\n", " output = output.replace(\"y\", \"j\")\n", " if \"ɯa\" in output:\n", " output = output.replace(\"ɯa\", \"wa\")\n", " \n", " if \"a aː\" in output:\n", " output = output.replace(\"a aː\",\"a~\")\n", " if \"a a\" in output:\n", " output = output.replace(\"a a\",\"a~\")\n", "\n", "\n", "\n", " \n", " \n", " output = replace_repeating_a((output))\n", " output = re.sub(r'\\s+~', '~', output)\n", " \n", " if \"oː~o oː~ o\" in output:\n", " output = output.replace(\"oː~o oː~ o\",\"oː~~~~~~\")\n", " if \"aː~aː\" in output:\n", " output = output.replace(\"aː~aː\",\"aː~~~\")\n", " if \"oɴ naː\" in output:\n", " output = output.replace(\"oɴ naː\",\"onnaː\")\n", " if \"aː~~ aː\" in output:\n", " output = output.replace(\"aː~~ aː\",\"aː~~~~\")\n", " if \"oː~o\" in output:\n", " output = output.replace(\"oː~o\",\"oː~~\")\n", " if \"oː~~o o\" in output:\n", " output = output.replace(\"oː~~o o\",\"oː~~~~\") # yeah I'm too tired to learn regex how did you know\n", "\n", " output = random_space_fix(output)\n", " output = random_sym_fix(output) # fixing some symbols, if they have a specific white space such as miku& sakura -> miku ando sakura\n", " output = random_sym_fix_no_space(output) # same as above but for those without white space such as miku&sakura -> miku ando sakura\n", " # if \"ɯ\" in output:\n", " # output = output.replace(\"ɯ\",\"U\")ss\n", " # if \"ʔ\" in output:\n", " # output = output.replace(\"ʔ\",\"!\")\n", " \n", " return output.lstrip()\n", "# def process_row(row):\n", "# return {'phonemes': [phonemize(word) for word in row['phonemes']]}\n", "\n", "\n", "# Example usage\n", "text = \"\"\"日本には、美しい自然と豊かな文化があります。特に、四季の変化は日本の魅力の一つです。春には桜の花が咲き、多くの人々が花見に出かけます。桜の下で家族や友人と一緒にお弁当を食べたり、お酒を飲んだりして楽しむのが一般的です。\n", "\n", "夏には、各地で花火大会が行われます。夜空に打ち上げられる花火はとても美しく、多くの人々が浴衣を着て参加します。また、夏は海やプールで泳ぐのも楽しいです。海辺では、サンドイッチやアイスクリームを食べながら、波と戯れることもできます。\n", "\n", "秋には、紅葉が美しく、多くの人々が紅葉狩りに出かけます。特に京都の嵐山や奈良の奈良公園は、紅葉が有名です。紅葉の下を散歩しながら、自然の美しさを感じることができます。また、秋はおいしい果物や野菜がたくさん取れる季節でもあります。りんごやさつまいも、そして栗など、秋の味覚を楽しむことができます。\n", "\n", "冬には、雪が降り、特に北海道や東北地方では雪祭りが開催されます。雪像や氷の彫刻が展示され、夜にはライトアップされてとても幻想的です。また、温泉に入るのも冬の楽しみの一つです。暖かいお湯に浸かりながら、外の雪景色を眺めるのは最高の贅沢です。\n", "\n", "日本には、このように四季ごとに違った魅力があります。どの季節に訪れても、新しい発見や感動があることでしょう。日本の自然と文化を体験して、心豊かな旅を楽しんでください。\"\"\"\n", "result = phonemize(text)\n", "print(result)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "ref_texts = {}\n", "ref_texts['Happy'] = \"neː neː, iʔɕo ni kako e no tabi niːkanai? 
dʑiɴrɯi ga tsɯkɯʔta tɕoɯsɯgoi tatemono to ka mi niːkerɯɴ da jo!\"\n", "ref_texts['Sad'] = \"zanneɴ naɴ dakedo, keiki to ɕinɽai o toɽimodosoɯ to ɕiteta no ni, kanaɽi koɯtai ɕitɕaʔtaɴ da.\"\n", "ref_texts['Angry'] = \"temmoŋgakɯ nante bakagete irɯ! sono ɽiɽoɴ wa keʔkandaɽake no kansatsɯ to katajoʔta kaiɕakɯ ni motozɯite irɯɴ da!\"\n", "ref_texts['Surprised'] = \"ɕindʑiɽaɽenai! kono ike de ataɽaɕi tane no bakɯteɽia o haʔkeɴ ɕitaʔte hontoɯ na no?\"" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "\n", "df3.to_csv(\"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/imas_split/shiki/shiki_fine/shiki_finetune.csv\", index=False, sep=\"|\", header=None)" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "8efe9d5252f64d799ced920633af72e0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/2233 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/plain": [ "'UFUFUFU!'" ] }, "execution_count": 26, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# from datasets import load_dataset\n", "# dataset = load_dataset(\"Respair/ehehe_chunk\")\n", "# dataset = dataset['train'].select(range(2250))\n", "\n", "# dataset = dataset.filter(lambda x: x['transcription'] is not None and x['transcription'] != '')\n", "\n", "# dataset = dataset.map(lambda x: phonemize(x['transcription']))\n", "# dataset = dataset.remove_columns('transcription')\n", "# dataset[0]['sentence']" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e5297a720b134b29a2a1808cc7df5a98", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/2233 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset = dataset.map(lambda x: {'path': x['audio']['path']})" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2f0b72d8ad9f455686dbbbb67d33bde1", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Saving the dataset (0/1 shards): 0%| | 0/2233 [00:00, ? 
examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "dataset.save_to_disk(\"/home/austin/disk1/mvc_cache/resp_ds/Fa_Data_14sep2024/ehehe/unproc\")" ] }, { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "/home/austin/disk2/llmvcs/tt\n" ] } ], "source": [ "%cd /home/austin/disk2/llmvcs/tt\n", "\n", "from cotlet.phon import phonemize\n", "import cutlet" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "'ɕoɯ neɴ'" ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "\n", "phonemize(\"少年\")" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Renamed: CN_031.mp3(1).wav -> CN_031.wav\n", "Renamed: CN_010.mp3(1).wav -> CN_010.wav\n", "Renamed: CN_021.mp3(1).wav -> CN_021.wav\n", "Renamed: CN_030.mp3(1).wav -> CN_030.wav\n", "Renamed: CN_011.mp3(1).wav -> CN_011.wav\n", "Renamed: CN_007.mp3(1).wav -> CN_007.wav\n", "Renamed: CN_006.mp3(1).wav -> CN_006.wav\n", "Renamed: CN_028.mp3(1).wav -> CN_028.wav\n", "Renamed: CN_022.mp3(1).wav -> CN_022.wav\n", "Renamed: CN_027.mp3(1).wav -> CN_027.wav\n", "Renamed: CN_029.mp3(1).wav -> CN_029.wav\n", "Renamed: CN_036.mp3(1).wav -> CN_036.wav\n", "Renamed: CN_033.mp3(1).wav -> CN_033.wav\n", "Renamed: CN_025.mp3(1).wav -> CN_025.wav\n", "Renamed: CN_026.mp3(1).wav -> CN_026.wav\n", "Renamed: CN_023.mp3(1).wav -> CN_023.wav\n", "Renamed: CN_042.mp3(1).wav -> CN_042.wav\n", "Renamed: CN_005.mp3(1).wav -> CN_005.wav\n", "Renamed: CN_013.mp3(1).wav -> CN_013.wav\n", "Renamed: CN_019.mp3(1).wav -> CN_019.wav\n", "Renamed: CN_002.mp3(1).wav -> CN_002.wav\n", "Renamed: CN_037.mp3(1).wav -> CN_037.wav\n", "Renamed: CN_020.mp3(1).wav -> CN_020.wav\n", "Renamed: CN_018.mp3(1).wav -> CN_018.wav\n", "Renamed: CN_004.mp3(1).wav -> CN_004.wav\n", "Renamed: CN_008.mp3(1).wav -> CN_008.wav\n", "Renamed: CN_024.mp3(1).wav -> CN_024.wav\n", "Renamed: CN_014.mp3(1).wav -> CN_014.wav\n", "Renamed: CN_032.mp3(1).wav -> CN_032.wav\n", "Renamed: CN_012.mp3(1).wav -> CN_012.wav\n", "Renamed: CN_017.mp3(1).wav -> CN_017.wav\n", "Renamed: CN_003.mp3(1).wav -> CN_003.wav\n" ] } ], "source": [ "import os\n", "\n", "# Define the directory path\n", "directory = \"/home/austin/disk1/stts-zs_cleaning/data/moe_soshy/Japanese/saori\"\n", "\n", "# Iterate over all files in the directory\n", "for filename in os.listdir(directory):\n", " # Check if the file name contains \".mp3(1)\"\n", " if \".mp3(1)\" in filename:\n", " # Construct the new file name by removing \".mp3(1)\"\n", " new_filename = filename.replace(\".mp3(1)\", \"\")\n", " \n", " # Construct the full file paths\n", " old_file_path = os.path.join(directory, filename)\n", " new_file_path = os.path.join(directory, new_filename)\n", " \n", " # Rename the file\n", " os.rename(old_file_path, new_file_path)\n", " print(f\"Renamed: {filename} -> {new_filename}\")" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "93.4 μs ± 9.95 μs per loop (mean ± std. dev. 
of 7 runs, 10,000 loops each)\n" ] } ], "source": [ "%%timeit\n", "class stuff:\n", " def __init__(self):\n", " self.phon = phonemize\n", " \n", " def __call__(self,text):\n", " ps = self.phon(text)\n", " return ps\n", " \n", "lol = stuff()\n", "\n", "lol(\"少年\")\n", "\n", "\n" ] }, { "cell_type": "code", "execution_count": 36, "metadata": {}, "outputs": [], "source": [ "import os\n", "os.environ[\"CUDA_VISIBLE_DEVICES\"] = \"0\"\n" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "MOLADEALD ALALD ALALD WALLD.\n" ] } ], "source": [ "\n", "# pip install vllm\n", "\n", "from openai import OpenAI\n", "\n", "\n", "openai_api_key = \"EMPTY\"\n", "openai_api_base = \"http://localhost:8000/v1\"\n", "\n", "client = OpenAI(\n", " api_key=openai_api_key,\n", " base_url=openai_api_base,\n", ")\n", "\n", "model_name = \"Respair/Test_QwJP\"\n", "\n", "\n", "def p2g(param):\n", "\n", " chat_response = client.chat.completions.create(\n", "\n", " model=model_name,\n", " max_tokens=512,\n", "\n", "\n", " messages=[\n", " \n", " {\"role\": \"user\", \"content\": f\"{param}\"}]\n", " ) \n", " \n", " result = chat_response.choices[0].message.content\n", " if \" \" in result:\n", " result = result.replace(\" \",\" \")\n", "\n", " return result.lstrip()\n", "\n", "\n", "prompt= f\"\"\"Turn IPA to Japanese: mendoɯ dakaɽa.\"\"\"\n", "\n", "result= p2g(prompt)\n", "\n", "print(result)\n" ] }, { "cell_type": "code", "execution_count": 81, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "646f07f3a8f14b3da091d085b5989df0", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Map: 0%| | 0/13 [00:00, ? examples/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "def create_chat_template(example):\n", " chat = [\n", " {\"role\": \"user\", \"content\": f\"convert this pronunciation back to normal japanese if you see one, otherwise copy the same thing: {example['IPA2']}\"},\n", " {\"role\": \"assistant\", \"content\":example['text']}\n", " ]\n", " return {\"messages\": chat}\n", "\n", "# Assuming your dataset is called 'dataset'\n", "dataset2 = dataset2.map(create_chat_template, num_proc=1)\n" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "473\n" ] } ], "source": [ "import os\n", "print(len(os.listdir(\"/home/austin/disk2/llmvcs/tt/stylekan/Data/data\")))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Folder contents zipped successfully to /home/austin/disk1/mvc_cache/moe.zip\n" ] } ], "source": [ "import os\n", "import zipfile\n", "\n", "## Backing Up to upload to hf and then delete to save space here\n", "\n", "def zip_folder(source_folder, output_zip):\n", "\n", " if not os.path.exists(source_folder):\n", " print(f\"Error: The source folder '{source_folder}' does not exist.\")\n", " return\n", "\n", " output_dir = os.path.dirname(output_zip)\n", " if not os.path.exists(output_dir):\n", " os.makedirs(output_dir)\n", "\n", " with zipfile.ZipFile(output_zip, 'w', zipfile.ZIP_DEFLATED) as zipf:\n", "\n", " for root, _, files in os.walk(source_folder):\n", " for file in files:\n", "\n", " file_path = os.path.join(root, file)\n", "\n", " relative_path = os.path.relpath(file_path, source_folder)\n", "\n", " zipf.write(file_path, relative_path)\n", "\n", " print(f\"Folder contents zipped 
successfully to {output_zip}\")\n", "\n", "source_folder = \"/home/austin/disk2/llmvcs/tt/stylekan/Data/data\"\n", "output_zip = \"/home/austin/disk1/mvc_cache/moe.zip\"\n", "\n", "zip_folder(source_folder, output_zip)" ] }, { "cell_type": "code", "execution_count": 29, "metadata": {}, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "414d12f2c1a14dbca6c2e7ef80b74390", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 1: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "644de07b33174b5e818a332beddcd8bf", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 2: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "e19df57c0de749d0b8b57ae5e124f64f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 3: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "2457c96efe9345cfb6a86e9075bd257e", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 4: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0c552bcbe33447f6a5a1a22ede13270b", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 5: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "ea2b8a10f4e64edbb60d73961b2aa41c", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 6: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "582ee5c26da34f27a848cc09e4ab4595", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 7: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "0ee8752888ed4662916eb80071adb18d", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 8: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "55c1e433675543f29a94835260f53eb2", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 9: 0%| | 0/50 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "1bb9eb7ad8454bcab70e6ad87db77e2f", "version_major": 2, "version_minor": 0 }, "text/plain": [ "Zipping part 10: 0%| | 0/23 [00:00, ?folder/s]" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "import zipfile\n", "from tqdm.notebook import tqdm \n", "\n", "def zip_folders_in_parts(source_dir, dest_dir, part_prefix, folders_per_part):\n", "\n", " all_folders = [f for f in os.listdir(source_dir) if os.path.isdir(os.path.join(source_dir, f))]\n", " all_folders.sort() \n", "\n", " \n", " num_parts = (len(all_folders) + folders_per_part - 1) // folders_per_part\n", "\n", " for part_num in range(num_parts):\n", "\n", " start_index = part_num * folders_per_part\n", " end_index = min(start_index + folders_per_part, len(all_folders))\n", " part_folders = 
all_folders[start_index:end_index]\n", "\n", " zip_filename = os.path.join(dest_dir, f\"{part_prefix}_part{part_num + 1}.zip\")\n", " with zipfile.ZipFile(zip_filename, 'w', zipfile.ZIP_DEFLATED) as zipf:\n", " for folder in tqdm(part_folders, desc=f\"Zipping part {part_num + 1}\", unit=\"folder\"):\n", " folder_path = os.path.join(source_dir, folder)\n", " for root, _, files in os.walk(folder_path):\n", " for file in files:\n", " file_path = os.path.join(root, file)\n", " arcname = os.path.relpath(file_path, source_dir)\n", " zipf.write(file_path, arcname)\n", "\n", "source_directory = \"/home/austin/disk2/llmvcs/tt/stylekan/Data/data\"\n", "destination_directory = \"/home/austin/disk1/mvc_cache\"\n", "part_prefix = \"moe\"\n", "folders_per_part = 50\n", "\n", "\n", "zip_folders_in_parts(source_directory, destination_directory, part_prefix, folders_per_part)" ] }, { "cell_type": "code", "execution_count": 88, "metadata": {}, "outputs": [], "source": [ "def process_content(row):\n", " for item in row:\n", " if item['role'] == 'assistant':\n", " item['content'] = item['content'].lstrip()\n", " return row\n", "\n", "df['messages'] = df['messages'].apply(process_content)" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | text | \n", "IPA2 | \n", "
---|---|---|
0 | \n", "@@ | \n", "@@ | \n", "
1 | \n", "... | \n", "... | \n", "
2 | \n", "! | \n", "! | \n", "
3 | \n", "? | \n", "? | \n", "
4 | \n", "## | \n", "## | \n", "