{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "axhNf7ZHqblZ" }, "source": [ "# Model set-up\n" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 338, "referenced_widgets": [ "577cf34de0614f64a76c68fab94eb968", "55daffcad1da49a3bf39a0f25f964c16", "6d1da3c2efcb4f238cf1734e172d7ea8", "b3249294ef9349a484a3b2f6199c1807", "a1f4fcb471da44878b42cc16f9791034", "9af1074a7ccc43f893c0a0b216cf1c0b", "1ad78a56bf4b4555b58676c6849b548b", "e8a023b93abb4efd9a664a8e270efedf", "779dd02a5f7f49dcbb631e900b4dea30", "6da91cce06ed496abfd08c19e99178db", "283a21c83451452d8969f6411397bb16", "fefcb0aae6e940c6b70fafbf4788ea0b", "9c9d71b94fac443fac8d17599474f819", "150019c3216543c6b4f489e30312e576", "9631f2bc95724c28bcdb2393c91aced7", "e4b7fd9d52df46328c6b6029409825b6", "4d92351da2e946eb891e768dd6e7f3c6", "32020436e6d04da693d120e5fc1a3ed3", "477c0d00dbfd45c3be98a6ff8978548d", "f1954f797f724e9383a5bc204185915e", "f285c11341cc4b1bb52b26016521c33e", "2f1cff6ac11a49088b3b5082617b24e4", "a0045505bdc04c4c96c4c11889a15b91", "caec94d8946b4de2a198bbbc529b636f", "ea3f1e329a614f50995713e654272d09", "d040060ce8ee425691eafeaa30a463c8", "169719f8fb5b415694731802bfd13eff", "087e6131e2204e85976c0b7d9814c066", "0961d49693c64270951f7c845cacc374", "b36a1c036916422fb093fe0403428e2e", "ce11e3708d9d4f5e8916d38d136dcba8", "5a9ce9ccbae1461384d06d5ee1bbce85", "de9e97e95fc84ae3a665ea7f35465ec0", "b7a90f8a364b4ac9b60c4ec6abef89c3", "1b19f33407cd44e4a4329e8cc1144d73", "56e1282076b143aa95a62c70a99d7373", "e196b029379d497c8cae678d23ee2ce6", "f6dbb7249c1f4ecd8a29f3d21238eef1", "7006f450d55b4e59ab2a834ef016c11a", "ba712b4d81924176881021e95484ced9", "ed73d5ddb3864006b175fe52ef795fb4", "435a2cc7ffb14827a90ec445fb8d9977", "e1c71907bb7c40ffb79b8cf52888052a", "4427805fbf7540388c2cee3e5b6123fb", "f5f38f0582b34c8599dabed07199ee9a", "80930655f9254c66b4a676fca773465b", "0991cf83ad504b89a03b1e4021688b78", "6e7269dd900b43899cd2ca4afb0cc23e", "ac385be2db5144e8a591837aaf403874", "d1ed211f2ad24d80b782946fcf366f67", "0788436212f24f98b90ff780463ff6cb", "9bdd6ab3ea8d4a2d947e7111dc29654b", "2f22a44b0c6f4e1e93e32a6f64caee85", "b9bf6fdd7ec94776ac565c5824aad89c", "f9bd1bfd216941a69c4200096648d208", "9ff030ff167945d289853c74190c8cda", "c467413a1af041ef8f2942d4d84bb50a", "9c8f7f4b6d6a4b0a98a2a86d17ab10f7", "f1376e789ccd4c35877f295384cf9c40", "e7eab4e2d8c448cfa962a9e2540c5270", "ca448d6c9ae0451ab3b3b8bec84b6200", "176633a381ce4caf90de05cf34dfadc9", "01596c886f3949f9858b6d9d1f6f7c79", "354537c460fd476f830d632ae8579e60", "8fed23da4e754ff2b340d00c0cc79250", "cfc7ccfab053430485b94b320cc08cb4" ] }, "id": "az8A_hSIlJcw", "outputId": "0a8e2918-0201-4dce-bedf-c2e72cc8495b" }, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/mnt/e/TOM/Learning/Projects/pronunciation-error-detector/.venv/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] }, { "data": { "text/plain": [ "Wav2Vec2ForCTC(\n", " (wav2vec2): Wav2Vec2Model(\n", " (feature_extractor): Wav2Vec2FeatureEncoder(\n", " (conv_layers): ModuleList(\n", " (0): Wav2Vec2LayerNormConvLayer(\n", " (conv): Conv1d(1, 512, kernel_size=(10,), stride=(5,))\n", " (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", " (activation): GELUActivation()\n", " )\n", " (1-4): 4 x Wav2Vec2LayerNormConvLayer(\n", " (conv): Conv1d(512, 512, kernel_size=(3,), stride=(2,))\n", " (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", " (activation): GELUActivation()\n", " )\n", " (5-6): 2 x Wav2Vec2LayerNormConvLayer(\n", " (conv): Conv1d(512, 512, kernel_size=(2,), stride=(2,))\n", " (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", " (activation): GELUActivation()\n", " )\n", " )\n", " )\n", " (feature_projection): Wav2Vec2FeatureProjection(\n", " (layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)\n", " (projection): Linear(in_features=512, out_features=1024, bias=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " )\n", " (encoder): Wav2Vec2EncoderStableLayerNorm(\n", " (pos_conv_embed): Wav2Vec2PositionalConvEmbedding(\n", " (conv): ParametrizedConv1d(\n", " 1024, 1024, kernel_size=(128,), stride=(1,), padding=(64,), groups=16\n", " (parametrizations): ModuleDict(\n", " (weight): ParametrizationList(\n", " (0): _WeightNorm()\n", " )\n", " )\n", " )\n", " (padding): Wav2Vec2SamePadLayer()\n", " (activation): GELUActivation()\n", " )\n", " (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (layers): ModuleList(\n", " (0-23): 24 x Wav2Vec2EncoderLayerStableLayerNorm(\n", " (attention): Wav2Vec2SdpaAttention(\n", " (k_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (v_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (q_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " (out_proj): Linear(in_features=1024, out_features=1024, bias=True)\n", " )\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " (layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " (feed_forward): Wav2Vec2FeedForward(\n", " (intermediate_dropout): Dropout(p=0.0, inplace=False)\n", " (intermediate_dense): Linear(in_features=1024, out_features=4096, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " (output_dense): Linear(in_features=4096, out_features=1024, bias=True)\n", " (output_dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (final_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)\n", " )\n", " )\n", " )\n", " )\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (lm_head): Linear(in_features=1024, out_features=40, bias=True)\n", ")" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import torch\n", "import librosa\n", "import soundfile as sf\n", "from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC\n", "\n", "# Load the processor and model\n", "MODEL_NAME = \"mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme\" # wav2vec based phoneme trascriber trained on L2-ARTIC\n", "processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)\n", "model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)\n", "model.eval()\n", "\n", "# Check device availability\n", "device = \"cpu\"\n", "model.to(device)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# Load model directly\n", "from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq\n", "\n", "whisper_processor = AutoProcessor.from_pretrained(\"openai/whisper-tiny.en\")\n", "whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained(\"openai/whisper-tiny.en\").to(device)\n", "\n", "# # Set language to English\n", "# forced_decoder_ids = whisper_processor.get_decoder_prompt_ids(language=\"en\", task=\"transcribe\")\n", "# whisper_model.config.forced_decoder_ids = forced_decoder_ids\n", "\n", "def transcribe_into_English(audio_input):\n", " # Load audio file\n", " # audio_input = audio_input[0]\n", " audio_input = whisper_processor(audio_input, sampling_rate=16000, return_tensors=\"pt\").to(device)\n", "\n", " # Perform transcription\n", " with torch.no_grad():\n", " generated_ids = whisper_model.generate(audio_input.input_features)\n", "\n", " # Decode the transcription\n", " transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]\n", " return transcription.lower().strip()" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [], "source": [ "# Load and preprocess the audio file\n", "def load_audio(audio_path, target_sr=16000):\n", " \"\"\"Load an audio file and resample it to 16kHz.\"\"\"\n", " audio, sr = librosa.load(audio_path, sr=target_sr)\n", " return audio" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "False" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# helper\n", "temp = {}\n", "'name' in temp" ] }, { "cell_type": "markdown", "metadata": { "id": "5BDwRwcsbmBg" }, "source": [ "# Utils\n" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import re \n", "\n", "def get_nested_position(nested_list, flat_index):\n", " \"\"\"\n", " Finds the nested list and the index within it for a given flat index.\n", "\n", " Args:\n", " nested_list (list of lists): The list of lists.\n", " flat_index (int): The flattened index.\n", "\n", " Returns:\n", " tuple: (nested_list_index, element_index_in_nested_list)\n", " \"\"\"\n", " cumulative_index = 0\n", "\n", " for list_index, sublist in enumerate(nested_list):\n", " # Check if the flat index falls within the current sublist\n", " if cumulative_index + len(sublist) > flat_index:\n", " # Calculate the index within the sublist\n", " element_index = flat_index - cumulative_index\n", " return list_index, element_index\n", " # Update cumulative index\n", " cumulative_index += len(sublist)\n", " \n", " raise IndexError(\"Index out of range for the flattened list.\")\n", "\n", "def label_specific_elements_in_reference(reference, start_word_idx, start_element_idx, end_word_idx, end_element_idx, label):\n", " \"\"\"\n", " Labels elements in a nested list between specified start and end indices (inclusive).\n", "\n", " Args:\n", " reference (list of lists): The original list of lists.\n", " start_word_idx (int): Index of the starting nested list.\n", " start_element_idx (int): Index of the starting element in the start list.\n", " end_word_idx (int): Index of the ending nested list.\n", " end_element_idx (int): Index of the ending element in the end list.\n", " label: The label to attach to the elements.\n", "\n", " Returns:\n", " list of lists: A new list of lists with labels attached where applicable.\n", " \"\"\"\n", " labeled_reference = []\n", " for word_idx, sublist in enumerate(reference):\n", " labeled_sublist = []\n", "\n", " for element_idx, element in enumerate(sublist):\n", " if start_word_idx < end_word_idx:\n", " # Case 1: start_word_idx < end_word_idx\n", " if (\n", " (word_idx > start_word_idx and word_idx < end_word_idx) or\n", " (word_idx == start_word_idx and element_idx >= start_element_idx) or\n", " (word_idx == end_word_idx and element_idx <= end_element_idx)\n", " ):\n", " # Attach the label to elements within the inclusive range\n", " if isinstance(element, tuple):\n", " print(f\"There is already a label at index ({word_idx}, {element_idx})\") \n", " labeled_sublist.append((element, label))\n", " else:\n", " # Keep elements outside the range unchanged\n", " labeled_sublist.append(element)\n", " elif start_word_idx == end_word_idx:\n", " # Case 2: start_word_idx == end_word_idx\n", " if word_idx == start_word_idx and start_element_idx <= element_idx <= end_element_idx:\n", " # Attach the label to elements within the inclusive range\n", " if isinstance(element, tuple):\n", " print(f\"There is already a label at index ({word_idx}, {element_idx})\") \n", " labeled_sublist.append((element, label))\n", " else:\n", " # Keep elements outside the range unchanged\n", " labeled_sublist.append(element)\n", "\n", " labeled_reference.append(labeled_sublist)\n", " \n", " return labeled_reference\n", "\n", "def clean_text(text: str) -> str:\n", " \"\"\"\n", " Remove punctuation from the input string except for special characters \n", " that are part of a word, such as ' in I'm or - in hard-working.\n", "\n", " Parameters:\n", " text (str): Input string to clean.\n", " \n", " Returns:\n", " str: Cleaned string with allowed special characters retained.\n", " \"\"\"\n", " # Allow letters, spaces, apostrophes, and hyphens within words\n", " cleaned_text = re.sub(r'[^\\w\\s\\'-]', '', text) # Remove punctuation except ' and -\n", " cleaned_text = re.sub(r'\\s+', ' ', cleaned_text) # Normalize spaces\n", " return cleaned_text.lower().strip()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "import cmudict\n", "cmu_dict = cmudict.dict()" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "# WORKING: converting functions to class, currently done with the last function in the class\n", "import re\n", "from difflib import SequenceMatcher\n", "from IPython.display import HTML, display\n", "import copy \n", "from IPython.display import HTML, display\n", "from Bio import pairwise2\n", "from Bio.pairwise2 import format_alignment\n", "\n", "class PronunciationAssessment:\n", " def __init__(self, transcript, uttered_phonemes):\n", " # NOTE: removed all long signals ('ː') for compatibility with L2-artic's phoneme set (ssl model training set). American English. \n", " # ground truth phonemes are converted into arpabet first, and then into ipa using the arpabet_to_ipa dict, meaning the arpabet_to_ipa dict contains\n", " # the core ipa phoeneme set\n", "\n", " # NOTE: modifications to the list in https://www.dyslexia-reading-well.com/44-phonemes-in-english.html: \n", " # removed 'sc', 'ps', and 'st', in ipa_to_orthography of 's', because I want to assume it's silient \n", " self.ipa_to_orthography = {\n", " 'b': ['b', 'bb'], # Examples: bug, bubble\n", " 'd': ['d', 'dd', 'ed'], # Examples: dad, add, milled\n", " 'f': ['f', 'ff', 'ph', 'gh', 'lf', 'ft'], # Examples: fat, cliff, phone, enough, half, often\n", " 'ɡ': ['g', 'gg', 'gh', 'gu', 'gue'], # Examples: gun, egg, ghost, guest, prologue\n", " 'h': ['h', 'wh'], # Examples: hop, who\n", " 'dʒ': ['j', 'ge', 'g', 'dge', 'di', 'gg'], # Examples: jam, wage, giraffe, edge, soldier, exaggerate\n", " 'k': ['k', 'c', 'ch', 'cc', 'lk', 'qu', 'q(u)', 'ck', 'x'], # Examples: kit, cat, chris, accent, folk, bouquet, queen, rack, box\n", " 'l': ['l', 'll'], # Examples: live, well\n", " 'm': ['m', 'mm', 'mb', 'mn', 'lm'], # Examples: man, summer, comb, column, palm\n", " 'n': ['n', 'nn', 'kn', 'gn', 'pn', 'mn'], # Examples: net, funny, know, gnat, pneumonic, mnemonic\n", " 'p': ['p', 'pp'], # Examples: pin, dippy\n", " 'r': ['r', 'rr', 'wr', 'rh'], # Examples: run, carrot, wrench, rhyme\n", " 'ɹ': ['r', 'rr', 'wr', 'rh'], # Examples: run, carrot, wrench, rhyme\n", " 's': ['s', 'ss', 'c', 'ce', 'se'], # Examples: sit, less, circle, scene, psycho, listen, pace, course\n", " 't': ['t', 'tt', 'th', 'ed'], # Examples: tip, matter, thomas, ripped\n", " 'v': ['v', 'f', 'ph', 've'], # Examples: vine, of, stephen, five\n", " 'w': ['w', 'wh', 'u', 'o'], # Examples: wit, why, quick, choir\n", " 'z': ['z', 'zz', 's', 'ss', 'x', 'ze', 'se'], # Examples: zed, buzz, his, scissors, xylophone, craze\n", " 'ʒ': ['s', 'si', 'z'], # Examples: treasure, division, azure\n", " 'tʃ': ['ch', 'tch', 'tu', 'te'], # Examples: chip, watch, future, righteous\n", " 'ʃ': ['sh', 'ce', 's', 'ci', 'si', 'ch', 'sci', 'ti'], # Examples: sham, ocean, sure, special, pension, machine, conscience, station\n", " 'θ': ['th'], # Example: thongs\n", " 'ð': ['th'], # Example: leather\n", " 'ŋ': ['ng', 'n', 'ngue'], # Examples: ring, pink, tongue\n", " 'j': ['y', 'i', 'j'], # Examples: you, onion, hallelujah\n", " 'æ': ['a', 'ai', 'au'], # Examples: cat, plaid, laugh\n", " 'eɪ': ['a', 'ai', 'eigh', 'aigh', 'ay', 'er', 'et', 'ei', 'au', 'a_e', 'ea', 'ey'], # Examples: bay, maid, weigh, straight, pay, foyer, filet, eight, gauge, mate, break, they\n", " 'ɛ': ['e', 'ea', 'u', 'ie', 'ai', 'a', 'eo', 'ei', 'ae'], # Examples: end, bread, bury, friend, said, many, leopard, heifer, aesthetic\n", " 'i': ['e', 'ee', 'ea', 'y', 'ey', 'oe', 'ie', 'i', 'ei', 'eo', 'ay'], # Examples: be, bee, meat, lady, key, phoenix, grief, ski, deceive, people, quay\n", " 'ɪ': ['i', 'e', 'o', 'u', 'ui', 'y', 'ie'], # Examples: it, england, women, busy, guild, gym, sieve\n", " 'aɪ': ['i', 'y', 'igh', 'ie', 'uy', 'ye', 'ai', 'is', 'eigh', 'i_e'], # Examples: spider, sky, night, pie, guy, stye, aisle, island, height, kite\n", " 'ɒ': ['a', 'ho', 'au', 'aw', 'ough'], # Examples: swan, honest, maul, slaw, fought\n", " 'oʊ': ['o', 'oa', 'o_e', 'oe', 'ow', 'ough', 'eau', 'oo', 'ew'], # Examples: open, moat, bone, toe, sow, dough, beau, brooch, sew\n", " 'ʊ': ['o', 'oo', 'u', 'ou'], # Examples: wolf, look, bush, would\n", " 'ʌ': ['u', 'o', 'oo', 'ou'], # Examples: lug, monkey, blood, double\n", " 'u': ['o', 'oo', 'ew', 'ue', 'u_e', 'oe', 'ough', 'ui', 'oew', 'ou'], # Examples: who, loon, dew, blue, flute, shoe, through, fruit, manoeuvre, group\n", " 'ɔɪ': ['oi', 'oy', 'uoy'], # Examples: join, boy, buoy\n", " 'aʊ': ['ow', 'ou', 'ough'], # Examples: now, shout, bough\n", " 'ə': ['o', 'a', 'er', 'i', 'ar', 'our', 'ur', 'e'], # Examples: about, ladder, pencil, dollar, honour, augur\n", " 'eəʳ': ['air', 'are', 'ear', 'ere', 'eir', 'ayer'], # Examples: chair, dare, pear, where, their, prayer\n", " 'a': ['a'], # Example: arm\n", " 'ɜʳ': ['ir', 'er', 'ur', 'ear', 'or', 'our', 'yr'], # Examples: bird, term, burn, pearl, word, journey, myrtle\n", " 'ɔ': ['aw', 'a', 'au', 'or', 'ore', 'oar', 'our', 'augh', 'ar', 'ough'], # Examples: law, ball, haul,\n", " 'ɪəʳ': ['ear', 'eer', 'ere', 'ier'], # Examples: beer, fear, here, tier\n", " 'ʊəʳ': ['ure', 'our'], # Examples: sure, tour\n", "\n", " # Dialectal Variations\n", " 'ɚ': ['er', 'ir', 'ur', 'ar', 'or'], # Examples: butter, bird, dollar\n", " 'ɝ': ['er', 'ir', 'ur'], # Examples: herd, third, turn\n", " 'ʍ': ['wh'], # Examples: where, which, whale\n", " 'ɑ': ['a', 'ah'], # Examples: father, spa\n", " 'oʊ': ['o', 'ow', 'oe', 'ough', 'ew'] # Examples: go, snow, foe, though, sew\n", " }\n", "\n", " self.arpabet_to_ipa = {\n", " \"AA\": \"a\", # odd\n", " \"AE\": \"æ\", # at\n", " # \"AH\": \"ə\", # hut\n", " \"AO\": \"ɔ\", # ought\n", " \"AW\": \"aʊ\", # cow \n", " \"AX\": \"ə\", # discus\n", " \"AY\": \"aɪ\", # hide\n", " \"B\": \"b\", # be\n", " \"CH\": \"tʃ\", # cheese\n", " \"D\": \"d\", # dee\n", " \"DH\": \"ð\", # thee\n", " \"EH\": \"ɛ\", # Ed\n", " # \"ER\": \"ɝ\", # hurt\n", " \"EY\": \"eɪ\", # ate\n", " \"F\": \"f\", # fee\n", " \"G\": \"ɡ\", # green\n", " \"HH\": \"h\", # he\n", " \"IH\": \"ɪ\", # it\n", " \"IY\": \"i\", # eat\n", " \"JH\": \"dʒ\", # gee\n", " \"K\": \"k\", # key\n", " \"L\": \"l\", # lee\n", " \"M\": \"m\", # me\n", " \"N\": \"n\", # knee\n", " \"NG\": \"ŋ\", # ping\n", " \"OW\": \"oʊ\", # oat\n", " \"OY\": \"ɔɪ\", # toy\n", " \"P\": \"p\", # pee\n", " \"R\": \"ɹ\", # read\n", " \"S\": \"s\", # sea\n", " \"SH\": \"ʃ\", # she\n", " \"T\": \"t\", # tea\n", " \"TH\": \"θ\", # theta\n", " \"UH\": \"ʊ\", # hood\n", " \"UW\": \"u\", # two\n", " \"V\": \"v\", # vee\n", " \"W\": \"w\", # we\n", " \"Y\": \"j\", # yield\n", " \"Z\": \"z\", # zee\n", " \"ZH\": \"ʒ\", # seizure\n", "\n", " # Vowels with stress affecting IPA\n", " \"AH0\": \"ə\", # unstressed (about)\n", " \"AH1\": \"ʌ\", # stressed (hut)\n", " \"AH2\": \"ʌ\", # secondary stress (hut)\n", " \"ER0\": \"ɚ\", # unstressed (runner)\n", " \"ER1\": \"ɝ\", # stressed (bird)\n", " \"ER2\": \"ɝ\", # secondary stress (bird)\n", " \"EY0\": \"e\", # unstressed (obey)\n", " \"EY1\": \"eɪ\", # stressed (day)\n", " \"EY2\": \"eɪ\", # secondary stress (day)\n", " \"IH0\": \"ɨ\", # unstressed (possible centralization)\n", " \"IH1\": \"ɪ\", # stressed (bit)\n", " \"IH2\": \"ɪ\", # secondary stress (bit)\n", " \"UW0\": \"ʉ\", # unstressed (possible centralization)\n", " \"UW1\": \"u\", # stressed (food)\n", " \"UW2\": \"u\", # secondary stress (food)\n", " \"AO0\": \"ə\", # unstressed (centralized in some accents)\n", " \"AO1\": \"ɔ\", # stressed (thought)\n", " \"AO2\": \"ɔ\", # secondary stress (thought)\n", " \"AE0\": \"ə\", # unstressed (centralized in some accents)\n", " \"AE1\": \"æ\", # stressed (cat)\n", " \"AE2\": \"æ\", # secondary stress (cat)\n", " \"OW0\": \"o\", # unstressed (less diphthongized)\n", " \"OW1\": \"oʊ\", # stressed (go)\n", " \"OW2\": \"oʊ\", # secondary stress (go)\n", " \"UH0\": \"ɨ\", # unstressed (centralized or reduced)\n", " \"UH1\": \"ʊ\", # stressed (put)\n", " \"UH2\": \"ʊ\", # secondary stress (put)\n", "\n", " # unknown phoneme\n", " \"unk\": \"unk\"\n", " }\n", "\n", " # whether the two phonemes are considered correct (value = 1), acceptable (value = 2), or wrong (value = 0)\n", " self.phoneme_pair_label = {\n", " # Completely correct pairs (self-similarity)\n", " **{(p, p): 1 for p in [\n", " 'b', 'd', 'f', 'g', 'h', 'dʒ', 'k', 'l', 'm', 'n', 'p', 'r', 'ɹ', 's', 't', 'v', 'w', 'z', 'ʒ', 'tʃ',\n", " 'ʃ', 'θ', 'ð', 'ŋ', 'j', 'æ', 'eɪ', 'ɛ', 'i', 'ɪ', 'aɪ', 'ɒ', 'oʊ', 'ʊ', 'ʌ', 'u', 'ɔɪ', 'aʊ', 'ə',\n", " 'eəʳ', 'a', 'ɜʳ', 'ɔ', 'ɪəʳ', 'ʊəʳ', 'ɚ', 'ɝ', 'ʍ', 'ɑ', 'ɡ'\n", " ]},\n", "\n", " # Acceptable substitutions (value = 2)\n", " **{pair: 2 for pair in [\n", " ('b', 'p'), ('d', 't'), ('g', 'k'), ('ɡ', 'k'), ('v', 'f'), ('z', 's'), ('ʒ', 'ʃ'), ('ð', 'θ'),\n", " ('m', 'n'), ('m', 'ŋ'), ('n', 'ŋ'), ('r', 'ɹ'), ('l', 'r'), ('l', 'ɹ'), ('w', 'ʍ'),\n", " ('j', 'ɹ'), ('f', 'θ'), ('v', 'ð'), ('s', 'ʃ'), ('z', 'ʒ'), ('tʃ', 'dʒ'), ('tʃ', 'ʃ'),\n", " ('dʒ', 'ʒ'), ('i', 'ɪ'), ('ɪ', 'ɛ'), ('ɛ', 'æ'), ('ə', 'ʌ'), ('ə', 'ɜʳ'), ('ʌ', 'ɜʳ'),\n", " ('ə', 'ɚ'), ('u', 'ʊ'), ('ʊ', 'oʊ'), ('oʊ', 'ɔ'), ('ɔ', 'ɒ'), ('ɑ', 'ɒ'), ('eɪ', 'ɛ'),\n", " ('eɪ', 'æ'), ('aɪ', 'ɪ'), ('aʊ', 'ʊ'), ('ɔɪ', 'ɔ'), ('ɝ', 'ɚ'), ('ɪəʳ', 'ɜʳ'), ('ʊəʳ', 'ɔ'),\n", " ('ð', 'd'), ('ɑ', 'a')\n", " ] + [(b, a) for (a, b) in [\n", " ('b', 'p'), ('d', 't'), ('g', 'k'), ('ɡ', 'k'), ('v', 'f'), ('z', 's'), ('ʒ', 'ʃ'), ('ð', 'θ'),\n", " ('m', 'n'), ('m', 'ŋ'), ('n', 'ŋ'), ('r', 'ɹ'), ('l', 'r'), ('l', 'ɹ'), ('w', 'ʍ'),\n", " ('j', 'ɹ'), ('f', 'θ'), ('v', 'ð'), ('s', 'ʃ'), ('z', 'ʒ'), ('tʃ', 'dʒ'), ('tʃ', 'ʃ'),\n", " ('dʒ', 'ʒ'), ('i', 'ɪ'), ('ɪ', 'ɛ'), ('ɛ', 'æ'), ('ə', 'ʌ'), ('ə', 'ɜʳ'), ('ʌ', 'ɜʳ'),\n", " ('ə', 'ɚ'), ('u', 'ʊ'), ('ʊ', 'oʊ'), ('oʊ', 'ɔ'), ('ɔ', 'ɒ'), ('ɑ', 'ɒ'), ('eɪ', 'ɛ'),\n", " ('eɪ', 'æ'), ('aɪ', 'ɪ'), ('aʊ', 'ʊ'), ('ɔɪ', 'ɔ'), ('ɝ', 'ɚ'), ('ɪəʳ', 'ɜʳ'), ('ʊəʳ', 'ɔ'),\n", " ('ð', 'd'), ('ɑ', 'a')\n", " ] if (b, a) not in [(a, b)]]},\n", "\n", " # Completely wrong pairs (default value = 0)\n", " **{(p1, p2): 0 for p1 in [\n", " 'b', 'd', 'f', 'g', 'h', 'dʒ', 'k', 'l', 'm', 'n', 'p', 'r', 'ɹ', 's', 't', 'v', 'w', 'z', 'ʒ', 'tʃ',\n", " 'ʃ', 'θ', 'ð', 'ŋ', 'j', 'æ', 'eɪ', 'ɛ', 'i', 'ɪ', 'aɪ', 'ɒ', 'oʊ', 'ʊ', 'ʌ', 'u', 'ɔɪ', 'aʊ', 'ə',\n", " 'eəʳ', 'a', 'ɜʳ', 'ɔ', 'ɪəʳ', 'ʊəʳ', 'ɚ', 'ɝ', 'ʍ', 'ɑ', 'ɡ'\n", " ] for p2 in [\n", " 'b', 'd', 'f', 'g', 'h', 'dʒ', 'k', 'l', 'm', 'n', 'p', 'r', 'ɹ', 's', 't', 'v', 'w', 'z', 'ʒ', 'tʃ',\n", " 'ʃ', 'θ', 'ð', 'ŋ', 'j', 'æ', 'eɪ', 'ɛ', 'i', 'ɪ', 'aɪ', 'ɒ', 'oʊ', 'ʊ', 'ʌ', 'u', 'ɔɪ', 'aʊ', 'ə',\n", " 'eəʳ', 'a', 'ɜʳ', 'ɔ', 'ɪəʳ', 'ʊəʳ', 'ɚ', 'ɝ', 'ʍ', 'ɑ', 'ɡ'\n", " ] if p1 != p2 and (p1, p2) not in [\n", " ('b', 'p'), ('d', 't'), ('g', 'k'), ('ɡ', 'k'), ('v', 'f'), ('z', 's'), ('ʒ', 'ʃ'), ('ð', 'θ'),\n", " ('m', 'n'), ('m', 'ŋ'), ('n', 'ŋ'), ('r', 'ɹ'), ('l', 'r'), ('l', 'ɹ'), ('w', 'ʍ'),\n", " ('j', 'ɹ'), ('f', 'θ'), ('v', 'ð'), ('s', 'ʃ'), ('z', 'ʒ'), ('tʃ', 'dʒ'), ('tʃ', 'ʃ'),\n", " ('dʒ', 'ʒ'), ('i', 'ɪ'), ('ɪ', 'ɛ'), ('ɛ', 'æ'), ('ə', 'ʌ'), ('ə', 'ɜʳ'), ('ʌ', 'ɜʳ'),\n", " ('ə', 'ɚ'), ('u', 'ʊ'), ('ʊ', 'oʊ'), ('oʊ', 'ɔ'), ('ɔ', 'ɒ'), ('ɑ', 'ɒ'), ('eɪ', 'ɛ'),\n", " ('eɪ', 'æ'), ('aɪ', 'ɪ'), ('aʊ', 'ʊ'), ('ɔɪ', 'ɔ'), ('ɝ', 'ɚ'), ('ɪəʳ', 'ɜʳ'), ('ʊəʳ', 'ɔ'),\n", " ('ð', 'd'), ('ɑ', 'a')\n", " ] + [(b, a) for (a, b) in [\n", " ('b', 'p'), ('d', 't'), ('g', 'k'), ('ɡ', 'k'), ('v', 'f'), ('z', 's'), ('ʒ', 'ʃ'), ('ð', 'θ'),\n", " ('m', 'n'), ('m', 'ŋ'), ('n', 'ŋ'), ('r', 'ɹ'), ('l', 'r'), ('l', 'ɹ'), ('w', 'ʍ'),\n", " ('j', 'ɹ'), ('f', 'θ'), ('v', 'ð'), ('s', 'ʃ'), ('z', 'ʒ'), ('tʃ', 'dʒ'), ('tʃ', 'ʃ'),\n", " ('dʒ', 'ʒ'), ('i', 'ɪ'), ('ɪ', 'ɛ'), ('ɛ', 'æ'), ('ə', 'ʌ'), ('ə', 'ɜʳ'), ('ʌ', 'ɜʳ'),\n", " ('ə', 'ɚ'), ('u', 'ʊ'), ('ʊ', 'oʊ'), ('oʊ', 'ɔ'), ('ɔ', 'ɒ'), ('ɑ', 'ɒ'), ('eɪ', 'ɛ'),\n", " ('eɪ', 'æ'), ('aɪ', 'ɪ'), ('aʊ', 'ʊ'), ('ɔɪ', 'ɔ'), ('ɝ', 'ɚ'), ('ɪəʳ', 'ɜʳ'), ('ʊəʳ', 'ɔ'),\n", " ('ð', 'd'), ('ɑ', 'a')\n", " ] if (b, a) not in [(a, b)]]}\n", " }\n", "\n", " self.ipa_phonemes = list(self.ipa_to_orthography.keys())\n", " self.ipa_phonemes.append('unk')\n", "\n", " # instance-specific variables\n", " self.transcript = transcript.lower().strip()\n", " self.uttered_ipa_phonemes = uttered_phonemes\n", " self.ground_truth_arpabet_phonemes = \"\"\n", " self.ground_truth_ipa_phonemes = \"\"\n", "\n", " self.segmented_uttered_ipa_phonemes = []\n", " self.segmented_ground_truth_arpabet_phonemes = []\n", " self.segmented_ground_truth_ipa_phonemes = []\n", "\n", " def get_phoneme_count(self):\n", " return len(self.ipa_phonemes)\n", "\n", " def has_phoneme(self, phoneme): \n", " return phoneme in self.ipa_phonemes\n", "\n", " def convert_transcript_into_phonemes(self, get_all_versions=True):\n", " \"\"\"\n", " Parameters:\n", " get_all_versions (bool): Default to True. Whether to return all possible phoneme versions for each word.\n", " Convert a list of word into IPA phonems through ARPABET phonemes.\n", "\n", " Returns: \n", " bool: If the conversion is successful.\n", " \"\"\"\n", " if len(self.transcript) == 0: \n", " return False\n", " \n", " arap_phonemes = []\n", " for word in self.transcript.split():\n", " if len(cmu_dict[word]) != 0:\n", " if not get_all_versions:\n", " arpa_phons = self.clean_single_arpabet_phoneme_list(cmu_dict[word][0])\n", " else:\n", " phon_vers = cmu_dict[word]\n", " arpa_phons = [self.clean_single_arpabet_phoneme_list(phons) for phons in phon_vers]\n", " arap_phonemes.append(arpa_phons) # Use the first phoneme representation\n", " else:\n", " arap_phonemes.append([['unk']]) # Append 'UNK' for unknown words\\\n", "\n", " self.segmented_ground_truth_arpabet_phonemes = arap_phonemes\n", " if not get_all_versions:\n", " ipa_phonemes = []\n", " for word in arap_phonemes:\n", " cur_phonemes = []\n", " for phon in word:\n", " cur_phonemes.append(self.arpabet_to_ipa[phon])\n", " ipa_phonemes.append(cur_phonemes)\n", " else: \n", " ipa_phonemes = []\n", " for word in arap_phonemes:\n", " cur_word = []\n", " for ver in word:\n", " cur_ver = []\n", " for phon in ver:\n", " cur_ver.append(self.arpabet_to_ipa[phon])\n", " cur_word.append(cur_ver)\n", " ipa_phonemes.append(cur_word)\n", "\n", " self.segmented_ground_truth_ipa_phonemes = ipa_phonemes\n", " return True\n", " \n", " def remove_ipa_stress_markers(self, phonemes):\n", " \"\"\"\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return re.sub(r\"[ˈˌ]\", \"\", phonemes)\n", " \n", " def remove_ipa_length_markers(self, phonemes):\n", " \"\"\"\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return re.sub(r\"[ːˑ]\", \"\", phonemes)\n", " \n", " def remove_ipa_break_markers(self, phonemes):\n", " \"\"\"\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return re.sub(r\"[.‖|]\", \"\", phonemes)\n", " \n", " def remove_ipa_tone_markers(self, phonemes):\n", " \"\"\"\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return re.sub(r\"[˥˦˧˨˩]\", \"\", phonemes)\n", " \n", " def remove_ipa_global_markers(self, phonemes):\n", " \"\"\"\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return re.sub(r\"[↗↘]\", \"\", phonemes)\n", " \n", " def remove_ipa_diacritics(self, phonemes):\n", " \"\"\"\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return re.sub(r\"[̩̯̪̠̟̹̜̬̥̤̰̼̩̯̝̞̊̃̚]\", \"\", phonemes)\n", " \n", " def remove_tie_bars(self, phonemes):\n", " \"\"\"\n", " Removes all tie bars (͡) from a string of phonemes.\n", "\n", " Parameters:\n", " phonemes (str): A string of phonemes (e.g. \"ˈɪŋɡlɪʃ\")\n", " \"\"\"\n", " return phonemes.replace('͡', '')\n", " \n", " def correct_shenanigans(self, ipa_phonemes):\n", " \"\"\"\n", " Manually correct phoneme-related problems, mostly arising from converstion from arpabet to ipa or from ssl's inferred ipa\n", " Parameters:\n", " ipa_phonemes (list of lists): Nested list of phonemes.\n", " \"\"\"\n", " new_ipa_phonemes = \"\"\n", " for word in ipa_phonemes.split():\n", " if len(new_ipa_phonemes) > 0:\n", " new_ipa_phonemes += \" \" \n", " cur_word = \"\"\n", " for i, phoneme in enumerate(list(word)):\n", " if phoneme == \"ʌ\":\n", " if i == 0 or i == len(word) - 1 or len(word) > 4:\n", " cur_word += \"ə\"\n", " else:\n", " cur_word += phoneme\n", " else:\n", " cur_word += phoneme\n", " new_ipa_phonemes += cur_word\n", " return new_ipa_phonemes\n", " \n", " def clean_ipa_phonemes(self):\n", " \"\"\"\n", " Clean uttered phonemes by removing stress, length, break, tone, global, and diacritic markers, as well as tie bars.\n", " \"\"\"\n", " phonemes = self.uttered_ipa_phonemes\n", " phonemes = self.remove_ipa_stress_markers(phonemes)\n", " phonemes = self.remove_ipa_length_markers(phonemes)\n", " phonemes = self.remove_ipa_break_markers(phonemes)\n", " phonemes = self.remove_ipa_tone_markers(phonemes)\n", " phonemes = self.remove_ipa_global_markers(phonemes)\n", " phonemes = self.remove_ipa_diacritics(phonemes)\n", " phonemes = self.remove_tie_bars(phonemes)\n", " phonemes = self.correct_shenanigans(phonemes)\n", " self.uttered_ipa_phonemes = phonemes\n", " \n", " return True\n", " \n", " def remove_stress_indicator_from_arpabet_phonemes(self, arpabet_phoneme_list):\n", " \"\"\"\n", " Remove all stress markers (trailing numbers), excluding AH and ER (due to their nuances, refer to the arpa_to_ipa dict for detail)\n", "\n", " Parameters:\n", " arpabet_phoneme_list (list of lists): Nested list of phonemes.\n", "\n", " Returns:\n", " list of lists: Updated nested list with numbers removed from phonemes.\n", " \"\"\"\n", " cleaned_phon_list = []\n", " for word_phonemes in arpabet_phoneme_list:\n", " cleaned_phon_list = []\n", " for phoneme in word_phonemes:\n", " if not phoneme.startswith(('AH', 'ER')):\n", " cleaned_phon_list.append(re.sub(r'\\d', '', phoneme))\n", " else:\n", " cleaned_phon_list.append(phoneme)\n", " cleaned_phon_list.append(cleaned_phon_list)\n", "\n", " return cleaned_phon_list\n", " \n", " def remove_stress_indicator_from_single_arpabet_phoneme_list(self, phon_list):\n", " \"\"\"\n", " Remove all stress markers (trailing numbers), excluding AH and ER (due to their nuances, refer to the arpa_to_ipa dict for detail)\n", "\n", " Parameters:\n", " phon_list (list(str)): The list of arpabet phoneme\n", "\n", " Returns:\n", " str: Updated phoneme with numbers removed.\n", " \"\"\"\n", " cleaned_phon_list = []\n", " for phoneme in phon_list:\n", " if not phoneme.startswith(('AH', 'ER')):\n", " cleaned_phon_list.append(re.sub(r'\\d', '', phoneme))\n", " else:\n", " cleaned_phon_list.append(phoneme)\n", "\n", " return cleaned_phon_list\n", " \n", " def clean_arpabet_phonemes(self, arpabet_phoneme_list):\n", " \"\"\"\n", " Parameters:\n", " arpabet_phoneme_list (list of lists): Nested list of phonemes.\n", " \"\"\"\n", " cleaned_phonemes = self.remove_stress_indicator_from_arpabet_phonemes(arpabet_phoneme_list)\n", " return cleaned_phonemes\n", " \n", " def clean_single_arpabet_phoneme_list(self, phon_list):\n", " \"\"\"\n", " Parameters:\n", " phon_list (list(str)): The list of arpabet phoneme\n", " \"\"\"\n", " cleaned_phon = self.remove_stress_indicator_from_single_arpabet_phoneme_list(phon_list)\n", " return cleaned_phon\n", " \n", " def split_phoneme_sequence(self):\n", " \"\"\"\n", " Splits a the uttered phoneme sequence (of a string of phoneme with each word separated by a space) into individual phonemes based on the IPA dictionary keys.\n", " \"\"\"\n", " sequence = self.uttered_ipa_phonemes.strip()\n", " i = 0\n", " keys = sorted(self.ipa_phonemes, key=len, reverse=True) # Prioritize longer matches\n", " \n", " sequence_phonemes = []\n", " word_phonemes = []\n", " while i < len(sequence):\n", " # if reaches the end of a word\n", " if sequence[i] == ' ':\n", " if word_phonemes:\n", " sequence_phonemes.append(word_phonemes)\n", " word_phonemes = []\n", " i += 1\n", " continue\n", " match = None\n", "\n", " # otherwise\n", " for key in keys:\n", " if sequence[i:i+len(key)] == key:\n", " match = key\n", " word_phonemes.append(match)\n", " i += len(key)\n", " break\n", " if not match: # No phoneme matched\n", " word_phonemes.append('unk')\n", " i += 1\n", "\n", " if word_phonemes:\n", " sequence_phonemes.append(word_phonemes)\n", " self.segmented_uttered_ipa_phonemes = sequence_phonemes\n", " \n", " def evaluate_pronunciation(self, reference: list, pronunciation: list):\n", " \"\"\"\n", " Evaluate the pronunciation of a word or sentence by comparing it to a reference.\n", " \n", " Args:\n", " reference (list(list(str))): List of words, each word is a list of phonemes representing the correct pronunciation.\n", " pronunciation (list(list(str))): List of words, each word is a list of phonemes representing the pronunciation to be evaluated.\n", "\n", " Returns:\n", " list(dict): A list of dictionaries (one for each word) containing the evaluation results.\n", " \"\"\"\n", " smushed_ref = []\n", " smushed_pron = []\n", "\n", " smushed_ref = [item for word in reference for item in word]\n", " smushed_pron = [item for word in pronunciation for item in word]\n", "\n", " matcher = SequenceMatcher(None, smushed_ref, smushed_pron)\n", " alignment = matcher.get_opcodes()\n", " \n", " # Initialize results for errors and labels\n", " errors = {\"matches\": [], \"substitutions\": [], \"insertions\": [], \"deletions\": []}\n", " labels = copy.deepcopy(reference)\n", " processed_indices = set() # Track indices in the reference that are processed\n", " \n", " # Process each alignment operation\n", " for tag, i1, i2, j1, j2 in alignment:\n", " if tag == \"equal\":\n", " # Matches: Add to errors and label as 1\n", " errors[\"matches\"].extend(smushed_ref[i1:i2])\n", " start_word_idx, start_element_idx = get_nested_position(reference, i1)\n", " end_word_idx, end_element_idx = get_nested_position(reference, i2 - 1)\n", "\n", " labels = label_specific_elements_in_reference(labels, start_word_idx, start_element_idx, end_word_idx, end_element_idx, 1)\n", " # labels.extend([(phoneme, 1) for phoneme in reference[i1:i2]])\n", " processed_indices.update(range(i1, i2))\n", " elif tag == \"replace\":\n", " # Substitutions: Check phoneme-by-phoneme\n", " ref_segment = smushed_ref[i1:i2]\n", " pron_segment = smushed_pron[j1:j2]\n", " # go through each pair of phoneme in ref and pron segment, if they are labeled 2 or 1 in the phoneme_pair_label, remove them as mistakes\n", " original_i1 = i1\n", " original_i2 = i2\n", " for ref_phoneme, pron_phoneme in zip(ref_segment, pron_segment):\n", " if (ref_phoneme, pron_phoneme) in self.phoneme_pair_label:\n", " if self.phoneme_pair_label[(ref_phoneme, pron_phoneme)] in [1, 2]:\n", " processed_indices.add(i1)\n", " i1 += 1 # Move to the next index in the reference\n", " j1 += 1 # Move to the next index in the pronunciation\n", " \n", " if i1 > original_i1:\n", " start_word_idx, start_element_idx = get_nested_position(reference, original_i1)\n", " end_word_idx, end_element_idx = get_nested_position(reference, i1 - 1)\n", " labels = label_specific_elements_in_reference(labels, start_word_idx, start_element_idx, end_word_idx, end_element_idx, 1)\n", " \n", " if i1 >= original_i2: # if no more phoneme in reference left to process\n", " continue\n", "\n", " start_word_idx, start_element_idx = get_nested_position(reference, i1)\n", " end_word_idx, end_element_idx = get_nested_position(reference, i2 - 1)\n", "\n", " labels = label_specific_elements_in_reference(labels, start_word_idx, start_element_idx, end_word_idx, end_element_idx, 0)\n", " processed_indices.update(range(i1, i2)) \n", "\n", " for ref_phoneme, pron_phoneme in zip(ref_segment, pron_segment):\n", " if ref_phoneme != pron_phoneme:\n", " errors[\"substitutions\"].append((ref_phoneme, pron_phoneme))\n", " # labels.append((ref_phoneme, 0))\n", " processed_indices.add(i1)\n", " i1 += 1 # Move to the next index in the reference\n", " \n", " # Handle leftover phonemes in reference (deletions)\n", " if len(ref_segment) > len(pron_segment):\n", " for leftover in ref_segment[len(pron_segment):]:\n", " errors[\"deletions\"].append(leftover)\n", " # labels.append((leftover, 0))\n", " processed_indices.add(i1)\n", " i1 += 1\n", " \n", " # Handle leftover phonemes in pronunciation (insertions)\n", " if len(pron_segment) > len(ref_segment):\n", " for leftover in pron_segment[len(ref_segment):]:\n", " errors[\"insertions\"].append(leftover)\n", " elif tag == \"insert\":\n", " # Insertions: Add to errors, no effect on reference labels\n", " errors[\"insertions\"].extend(smushed_pron[j1:j2])\n", " elif tag == \"delete\":\n", " # Deletions: Add to errors and label as 0\n", " errors[\"deletions\"].extend(smushed_ref[i1:i2])\n", " start_word_idx, start_element_idx = get_nested_position(reference, i1)\n", " end_word_idx, end_element_idx = get_nested_position(reference, i2 - 1)\n", "\n", " labels = label_specific_elements_in_reference(labels, start_word_idx, start_element_idx, end_word_idx, end_element_idx, 0)\n", " # labels.extend([(phoneme, 0) for phoneme in reference[i1:i2]])\n", " processed_indices.update(range(i1, i2))\n", " \n", " # Post-check: Ensure all phonemes in the reference are processed\n", " for i, phoneme in enumerate(smushed_ref):\n", " if i not in processed_indices:\n", " errors[\"deletions\"].append(phoneme)\n", " start_word_idx, start_element_idx = get_nested_position(reference, i)\n", " end_word_idx, end_element_idx = get_nested_position(reference, i)\n", "\n", " labels = label_specific_elements_in_reference(labels, start_word_idx, start_element_idx, end_word_idx, end_element_idx, 0)\n", " # labels.append((phoneme, 0))\n", " \n", " return errors, labels\n", " \n", " def map_boundary(self, segmented_ground_truth_list, segmented_uttered_list):\n", " \"\"\"\n", " Maps the boundaries of each word in the ground truth to the corresponding part in the uttered list.\n", " Rewrites to self.segmented_uttered_ipa_phonemes\n", " Args:\n", " segmented_ground_truth_list (list): A single list of phonemes, word are separed with space \n", " segmented_uttered_list (list): A single list of phonemes, word are separed with space\n", " \"\"\"\n", " \n", " alignments = pairwise2.align.globalms(\n", " segmented_ground_truth_list, segmented_uttered_list, \n", " match=1, # Score for match\n", " mismatch=-1, # Penalty for mismatch\n", " open=-2, # Penalty for opening a gap\n", " extend=-1, # Penalty for extending a gap,\n", " gap_char=['-']\n", " )\n", " best_alignment = alignments[0]\n", "\n", " # Extract the aligned sequences\n", " aligned_ground_truth = best_alignment.seqA\n", " aligned_uttered = best_alignment.seqB\n", "\n", " # Process the alignment to group corresponding characters\n", " segments = []\n", " current_segment = []\n", " for g_char, u_char in zip(aligned_ground_truth, aligned_uttered):\n", " if g_char == \" \": # Word boundary in ground truth\n", " if current_segment: # Append collected segment\n", " segments.append(current_segment)\n", "\n", " current_segment = []\n", " else:\n", " if g_char != \"-\" and g_char != \" \": # Only consider characters from uttered list\n", " current_segment.append(u_char)\n", "\n", " # Append the last segment, if any\n", " if current_segment:\n", " segments.append(current_segment)\n", "\n", " # Output the segmented uttered list\n", " self.segmented_uttered_ipa_phonemes = segments\n", " \n", " def evaluate_full_pronunciation(self):\n", " \"\"\"\n", " Evaluates the full pronunciation of the utterance against the ground truth. \n", " self.segmented_ground_truth_ipa_phonemes and self.segmented_uttered_ipa_phonemes need to be available.\n", " \"\"\"\n", " if len(self.segmented_ground_truth_ipa_phonemes) == 0 or len(self.segmented_uttered_ipa_phonemes) == 0:\n", " raise ValueError(\"Segmented ground truth and uttered phonemes must be non-empty.\")\n", " \n", " one_ground_truth = []\n", " one_uttered = []\n", " for word in self.segmented_ground_truth_ipa_phonemes:\n", " one_ground_truth.extend(word[0])\n", " one_ground_truth.append(\" \")\n", " for word in self.segmented_uttered_ipa_phonemes:\n", " one_uttered.extend(word)\n", " one_uttered.append(\" \")\n", "\n", " # correctly add spaces to uttered phonemes\n", " self.map_boundary(one_ground_truth, one_uttered)\n", "\n", " final_label_list = []\n", " for reference, uttered in zip(self.segmented_ground_truth_ipa_phonemes, self.segmented_uttered_ipa_phonemes):\n", " final_label_list.append(self.evaluate_pronunciation_for_word(uttered, reference))\n", " return final_label_list\n", " \n", " def evaluate_pronunciation_for_word(self, uttered: list, reference: list):\n", " \"\"\"\n", " Evaluates pronunciation for a word.\n", " \n", " Args:\n", " uttered (list): A list of phonemes representing the uttered phonemes for this word.\n", " reference (list): A list of list, each nested list being a possible pronunciation (ground truth) of the word.\n", " \n", " Returns:\n", " list(tuple): Each tuple is (phoneme_label)\n", " \"\"\"\n", " max_score = None\n", " final_label_list = []\n", " for ground_truth in reference:\n", " score = 0\n", " label_list = []\n", "\n", " alignments = pairwise2.align.globalms(\n", " ground_truth, uttered, \n", " match=1, # Score for match\n", " mismatch=-1, # Penalty for mismatch\n", " open=-2, # Penalty for opening a gap\n", " extend=-1, # Penalty for extending a gap,\n", " gap_char=['-']\n", " )\n", "\n", " # Extract the aligned sequences\n", " aligned_ground_truth, aligned_uttered, _, _, _ = alignments[0]\n", "\n", " # Iterate through the characters in the aligned sequences\n", " for gt_char, utt_char in zip(aligned_ground_truth, aligned_uttered):\n", " # Skip gaps in the ground truth\n", " if gt_char == '-' or gt_char == ' ':\n", " continue\n", "\n", " # Assign a label based on the tuple (gt_char, utt_char)\n", " if utt_char != '-': # Only consider matched characters, not gaps in uttered\n", " key = (gt_char, utt_char)\n", " if key in self.phoneme_pair_label and self.phoneme_pair_label[key] in [1, 2]:\n", " label = 1\n", " score += 1\n", " else:\n", " label = 0\n", " score -= 1\n", " else:\n", " label = 0 # Default label for unmatched characters\n", " score -= 1\n", " # Append the result as a tuple (ground_truth_char, label)\n", " label_list.append((gt_char, label))\n", "\n", " if max_score is None or score > max_score:\n", " max_score = score\n", " final_label_list = label_list\n", " # Return the label list\n", " return final_label_list\n", "\n", " def map_phonemes_to_segments(self, phoneme_labels, word):\n", " \"\"\"\n", " Maps each phoneme in the phoneme set to its corresponding segment (orthography) in the word.\n", " \n", " Args:\n", " phoneme_labels (list): List of phoneme labels in order.\n", " word (str): The word to map the phonemes to.\n", "\n", " Returns:\n", " list: List of tuples, each containing a phoneme and its corresponding segment.\n", " \"\"\"\n", " result = []\n", " remaining_word = word\n", "\n", " for phoneme_tup in phoneme_labels:\n", " phoneme = phoneme_tup[0]\n", " \n", " if phoneme not in self.ipa_to_orthography:\n", " # Skip the phoneme if not found in the map\n", " continue\n", "\n", " possible_spellings = self.ipa_to_orthography[phoneme]\n", " # Sort spellings by length in descending order to prioritize the longest match\n", " possible_spellings.sort(key=len, reverse=True)\n", "\n", " matched_spelling = None\n", " skipped_characters = []\n", "\n", " while remaining_word: # WORKING: if possible_spellings are not exhaustive, will consider the rest a silient grapheme\n", " for spelling in possible_spellings:\n", " if remaining_word.startswith(spelling):\n", " matched_spelling = spelling\n", " break\n", "\n", " if matched_spelling:\n", " break\n", "\n", " # If no match, treat the current character as part of a silent grapheme\n", " skipped_characters.append(remaining_word[0])\n", " remaining_word = remaining_word[1:]\n", "\n", " if not matched_spelling: # reach the end of word but no match, possibly meaning the possible_spellings are not exhaustive\n", " matched_spelling = \"\" \n", "\n", " # Add skipped characters to the result as silent graphemes\n", " for char in skipped_characters:\n", " result.append((('', 1), char))\n", "\n", " # Add the phoneme and matched spelling to the result\n", " result.append((phoneme_tup, matched_spelling))\n", "\n", " # Update the remaining word by removing the matched spelling\n", " if matched_spelling:\n", " remaining_word = remaining_word[len(matched_spelling):]\n", "\n", " if remaining_word:\n", " result.append((('', 1), remaining_word))\n", " print(f\"Unmapped segment of the word remains: '{remaining_word}'\")\n", "\n", " return result\n", " \n", " def map_phonemes_to_segments_for_api(self, phoneme_labels, word):\n", " \"\"\"\n", " Maps each phoneme in the phoneme set to its corresponding segment (orthography) in the word.\n", " Same as above, but different format to return the API call\n", " Args:\n", " phoneme_labels (list): List of phoneme labels in order.\n", " word (str): The word to map the phonemes to.\n", "\n", " Returns:\n", " list: List of tuples, each containing a phoneme and its corresponding segment.\n", " \"\"\"\n", " result = {\"word\": word, \"details\": []}\n", " remaining_word = word\n", "\n", " for phoneme_tup in phoneme_labels:\n", " phoneme = phoneme_tup[0]\n", " \n", " if phoneme not in self.ipa_to_orthography:\n", " # Skip the phoneme if not found in the map\n", " continue\n", "\n", " possible_spellings = self.ipa_to_orthography[phoneme]\n", " # Sort spellings by length in descending order to prioritize the longest match\n", " possible_spellings.sort(key=len, reverse=True)\n", "\n", " matched_spelling = None\n", " skipped_characters = []\n", "\n", " while remaining_word: # WORKING: if possible_spellings are not exhaustive, will consider the rest a silient grapheme\n", " for spelling in possible_spellings:\n", " if remaining_word.startswith(spelling):\n", " matched_spelling = spelling\n", " break\n", "\n", " if matched_spelling:\n", " break\n", "\n", " # If no match, treat the current character as part of a silent grapheme\n", " skipped_characters.append(remaining_word[0])\n", " remaining_word = remaining_word[1:]\n", "\n", " if not matched_spelling: # reach the end of word but no match, possibly meaning the possible_spellings are not exhaustive\n", " matched_spelling = \"\" \n", "\n", " # Add skipped characters to the result as silent graphemes\n", " for char in skipped_characters:\n", " result[\"details\"].append({\n", " \"phoneme\": \"\", # No phoneme\n", " \"word_segment\": char,\n", " \"label\": 1 # Assuming label for silent graphemes is 1\n", " })\n", "\n", " # Add the phoneme and matched spelling to the result\n", " result[\"details\"].append({\n", " \"phoneme\": phoneme_tup[0],\n", " \"word_segment\": matched_spelling,\n", " \"label\": phoneme_tup[1] # Assuming `phoneme_tup[1]` is the label\n", " })\n", "\n", " # Update the remaining word by removing the matched spelling\n", " if matched_spelling:\n", " remaining_word = remaining_word[len(matched_spelling):]\n", "\n", " if remaining_word: # WORKING: if possible_spellings are not exhaustive, will consider the rest a silient grapheme\n", " result[\"details\"].append({\n", " \"phoneme\": \"\", # No phoneme\n", " \"word_segment\": remaining_word,\n", " \"label\": 1 \n", " })\n", " print(f\"Unmapped segment of the word remains: '{remaining_word}'\")\n", "\n", " return result\n", " \n", " def generate_labels(self, display=True):\n", " results = []\n", " labels = self.evaluate_full_pronunciation()\n", " for label, word in zip(labels, self.transcript.split()):\n", " results.append(self.map_phonemes_to_segments(label, word))\n", "\n", " if display:\n", " self.display_ipa_phonemes_with_labels_and_segments(results, self.transcript)\n", " return results \n", " \n", " def generate_labels_for_api(self):\n", " results = []\n", " labels = self.evaluate_full_pronunciation()\n", " for label, word in zip(labels, self.transcript.split()):\n", " results.append(self.map_phonemes_to_segments_for_api(label, word))\n", "\n", " return results \n", " \n", " def handle_label_shenanigans(self, labels):\n", " \"\"\"\n", " Handle label shenanigans manually.\n", " - if θ is the last phoneme in a word, and it's labelled 0, change it to 1\n", " \"\"\"\n", " for word in labels:\n", " if word[-1][0] == \"θ\" and word[-1][1] == 0:\n", " word[-1] = (\"θ\", 1)\n", " return labels\n", " \n", " def display_ipa_phonemes_with_labels_and_segments(self, data, words):\n", " \"\"\"\n", " Display phonemes and their corresponding segments with labels.\n", " Incorrect phonemes and segments are displayed in red.\n", "\n", " Parameters:\n", " data (list of lists): Each sublist represents a word, and each element is ((phoneme, label), corresponding_segment).\n", " words (list of str): List of corresponding words for the data.\n", " \"\"\"\n", " # Initialize containers for styled phonemes and styled words\n", " styled_phonemes = []\n", " styled_words = []\n", "\n", " for word_data, word in zip(data, words):\n", " # Process phonemes and segments for each word\n", " styled_phoneme_word = []\n", " styled_word = []\n", " for ((phoneme, label), segment) in word_data:\n", " if label == 0:\n", " # Incorrect phoneme or segment\n", " styled_phoneme_word.append(f\"{phoneme}\")\n", " styled_word.append(f\"{segment}\")\n", " else:\n", " # Correct phoneme and segment\n", " styled_phoneme_word.append(f\"{phoneme}\")\n", " styled_word.append(f\"{segment}\")\n", "\n", " # Join phonemes for the current word and add to the phoneme container\n", " styled_phonemes.append(\"\".join(styled_phoneme_word))\n", " styled_words.append(\"\".join(styled_word))\n", " # Combine phonemes and words for display\n", " phoneme_content = \" \".join(styled_phonemes)\n", " word_content = \" \".join(styled_words)\n", "\n", " # Construct complete HTML\n", " html_content = f\"