Spaces: Runtime error
Commit · 83f43dd
1 Parent(s): 7bf7549
refactor: modularize all components
- Dockerfile +1 -1
- app/__pycache__/app.cpython-39.pyc +0 -0
- app/__pycache__/main.cpython-39.pyc +0 -0
- app/main.py +49 -0
- app/models/__init__.py +0 -0
- app/models/__pycache__/__init__.cpython-39.pyc +0 -0
- app/models/__pycache__/ssl_singleton.cpython-39.pyc +0 -0
- app/models/__pycache__/transcriber_singleton.cpython-39.pyc +0 -0
- app/models/ssl_singleton.py +48 -0
- app/models/transcriber_singleton.py +45 -0
- app/modules/__init__.py +0 -0
- app/modules/__pycache__/__init__.cpython-39.pyc +0 -0
- app/modules/pronunciation_coach/__init__.py +0 -0
- app/modules/pronunciation_coach/__pycache__/__init__.cpython-39.pyc +0 -0
- app/modules/pronunciation_coach/__pycache__/pronunciation_assessor.cpython-39.pyc +0 -0
- app/modules/pronunciation_coach/__pycache__/pronunciation_assessor_utils.cpython-39.pyc +0 -0
- app.py → app/modules/pronunciation_coach/pronunciation_assessor.py +3 -395
- app/modules/pronunciation_coach/pronunciation_assessor_utils.py +73 -0
- app/routes/__init__.py +0 -0
- app/routes/__pycache__/__init__.cpython-39.pyc +0 -0
- app/routes/__pycache__/predict.cpython-39.pyc +0 -0
- app/routes/__pycache__/transcribe.cpython-39.pyc +0 -0
- app/routes/predict.py +58 -0
- app/routes/transcribe.py +61 -0
- app/services/__init__.py +0 -0
- app/services/__pycache__/__init__.cpython-39.pyc +0 -0
- app/services/__pycache__/evaluate_pronunciation.cpython-39.pyc +0 -0
- app/services/__pycache__/transcribe.cpython-39.pyc +0 -0
- app/services/evaluate_pronunciation.py +69 -0
- app/services/transcribe.py +56 -0
- notebook-inference.ipynb → app/tester-notebook.ipynb +0 -0
- app/utils/__init__.py +0 -0
- app/utils/__pycache__/__init__.cpython-39.pyc +0 -0
- app/utils/__pycache__/cache.cpython-39.pyc +0 -0
- app/utils/__pycache__/general_utils.cpython-39.pyc +0 -0
- app/utils/cache.py +48 -0
- app/utils/general_utils.py +73 -0
- inference.py +0 -214
Dockerfile
CHANGED
@@ -29,4 +29,4 @@ COPY . .
 EXPOSE 7860
 
 # Run the FastAPI application
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app.main:app", "--host", "0.0.0.0", "--port", "7860"]
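Side note (not part of the commit): the module path changes because app.py moved into the app/ package, so uvicorn now has to import app.main. A minimal local sanity check, assuming the package layout added in this commit:

# Hedged sketch: run the relocated app locally, mirroring the new Dockerfile CMD.
# Assumes the app/ package from this commit is importable where this is run.
import uvicorn

if __name__ == "__main__":
    uvicorn.run("app.main:app", host="0.0.0.0", port=7860, log_level="info")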
app/__pycache__/app.cpython-39.pyc
ADDED
Binary file (1.71 kB).

app/__pycache__/main.cpython-39.pyc
ADDED
Binary file (1.8 kB).

app/main.py
ADDED
@@ -0,0 +1,49 @@
+
+from fastapi import FastAPI, UploadFile, Form, HTTPException
+from fastapi.responses import JSONResponse
+import uvicorn
+from typing import List
+import torch
+import soundfile as sf
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import re
+import numpy as np
+import cmudict
+from io import BytesIO
+import os
+import logging
+from joblib import Memory
+from difflib import SequenceMatcher
+import eng_to_ipa as ipa_conv
+import os
+import copy
+from IPython.display import HTML, display
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from pydub import AudioSegment
+from Bio import pairwise2
+from Bio.pairwise2 import format_alignment
+import asyncio
+from cachetools import TTLCache
+
+# Set the Numba cache directory to a writable location
+os.environ["NUMBA_CACHE_DIR"] = "/tmp"
+import librosa
+logging.basicConfig(level=logging.INFO)
+
+# package imports
+from routes.transcribe import router as transcriber_router
+from routes.predict import router as pronunciation_evaluation_router
+# Initialize FastAPI app
+app = FastAPI(title="Talkiee AI", version="1.0.0")
+
+# health check
+@app.get("/")
+def home():
+    return "Healthy bro!"
+
+app.include_router(transcriber_router, tags=["transcribe"])
+app.include_router(pronunciation_evaluation_router, tags=["pronunciation_evaluation"])
+if __name__ == '__main__':
+    port = os.environ.get("PORT", 10000)  # Default to 10000 if PORT is not set
+    logging.info(f"Starting server on PORT {port}")
+    uvicorn.run("main:app", host="0.0.0.0", port=int(port), log_level="info")
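For orientation (not in the diff): the new entrypoint can be smoke-tested with FastAPI's test client. This assumes the module's bare imports (routes..., utils...) resolve, i.e. that app/ itself is on sys.path, as when the server is started from inside that directory.

# Hypothetical smoke test for app/main.py; run from within app/ so that
# "from routes.transcribe import ..." and friends can be imported.
from fastapi.testclient import TestClient
from main import app

client = TestClient(app)
response = client.get("/")            # the health-check route added above
assert response.status_code == 200
assert response.json() == "Healthy bro!"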
app/models/__init__.py
ADDED
File without changes

app/models/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (171 Bytes).

app/models/__pycache__/ssl_singleton.cpython-39.pyc
ADDED
Binary file (2.04 kB).

app/models/__pycache__/transcriber_singleton.cpython-39.pyc
ADDED
Binary file (2.04 kB).

app/models/ssl_singleton.py
ADDED
@@ -0,0 +1,48 @@
+import os
+import torch
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+from utils.general_utils import process_audio
+import asyncio
+import librosa
+from utils.cache import audio_cache
+
+class SSLSingleton:
+    _instance = None
+
+    def __new__(cls, model_name="mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme", device=None):
+        if cls._instance is None:
+            cls._instance = super(SSLSingleton, cls).__new__(cls)
+            cls._instance._initialize(model_name, device)
+        return cls._instance
+
+    def _initialize(self, model_name, device):
+        # Set device (CPU or GPU)
+        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = "cpu"
+        # Load processor and model
+        print("Loading SSL processor and model...")  # This will only happen once
+        self.processor = Wav2Vec2Processor.from_pretrained(model_name)
+        self.model = Wav2Vec2ForCTC.from_pretrained(model_name)
+        self.model.eval()
+        self.model.to(self.device)  # Move model to the specified device
+
+    # an infernce function taking in processed audio input and returning the predictions
+    def infer(self, audio_input, device):
+        inputs = self.processor(audio_input, sampling_rate=16000, return_tensors="pt")
+        inputs = inputs.to(self.device)
+
+        with torch.no_grad():
+            logits = self.model(inputs.input_values).logits
+
+        predicted_ids = torch.argmax(logits, dim=-1)
+        uttered_phonemes = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+        return uttered_phonemes
+
+    async def infer_and_save_to_cache(self, file_name, audio_input, device):
+        uttered_phonemes = self.infer(audio_input, device)
+        async with audio_cache.lock:
+            new_cache = audio_cache.cache[file_name]
+            new_cache["uttered_phonemes"] = uttered_phonemes
+            audio_cache.cache[file_name] = new_cache
+        return uttered_phonemes
+ssl_model = SSLSingleton()
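For readers new to the pattern: the __new__-based singleton above guarantees the heavyweight processor/model pair is loaded once per process and reused by every caller. A stripped-down illustration with generic names (not from the repo):

# Minimal singleton sketch mirroring SSLSingleton's structure (illustrative only).
class HeavyModelSingleton:
    _instance = None

    def __new__(cls):
        if cls._instance is None:
            cls._instance = super().__new__(cls)
            cls._instance.model = "loaded once"  # stands in for from_pretrained(...)
        return cls._instance

a = HeavyModelSingleton()
b = HeavyModelSingleton()
assert a is b                 # every "constructor" call returns the same object
assert a.model == b.model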
app/models/transcriber_singleton.py
ADDED
@@ -0,0 +1,45 @@
+import os
+import torch
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from utils.general_utils import process_audio
+import asyncio
+import librosa
+
+class TranscriberSingleton:
+    _instance = None
+
+    def __new__(cls, model_name="openai/whisper-tiny.en", device=None):
+        if cls._instance is None:
+            cls._instance = super(TranscriberSingleton, cls).__new__(cls)
+            cls._instance._initialize(model_name, device)
+        return cls._instance
+
+    def _initialize(self, model_name, device):
+        # Set device (CPU or GPU)
+        # self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        self.device = "cpu"
+        # Load processor and model
+        print(f"Loading Whisper processor and model into {device}...")  # This will only happen once
+        self.processor = AutoProcessor.from_pretrained(model_name)
+        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(model_name)
+        self.model.eval()
+        self.model.to(self.device)  # Move model to the specified device
+
+    def transcribe_into_English(self, audio_input):
+        # Load audio file
+        audio_input = self.processor(audio_input, sampling_rate=16000, return_tensors="pt", language="en").to(self.device)
+
+        # Perform transcription
+        with torch.no_grad():
+            generated_ids = self.model.generate(audio_input.input_features)
+
+        # Decode the transcription
+        transcription = self.processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        return transcription.lower().strip()
+
+    def transcribe_from_file_path(self, file_path, target_sr=16000):
+        with open(file_path, "rb") as f:
+            audio_input, sr = librosa.load(file_path, sr=target_sr)
+        return self.transcribe_into_English(audio_input)
+
+transcriber_model = TranscriberSingleton()
app/modules/__init__.py
ADDED
File without changes

app/modules/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (172 Bytes).

app/modules/pronunciation_coach/__init__.py
ADDED
File without changes

app/modules/pronunciation_coach/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (192 Bytes).

app/modules/pronunciation_coach/__pycache__/pronunciation_assessor.cpython-39.pyc
ADDED
Binary file (22.6 kB).

app/modules/pronunciation_coach/__pycache__/pronunciation_assessor_utils.cpython-39.pyc
ADDED
Binary file (2.16 kB).
app.py → app/modules/pronunciation_coach/pronunciation_assessor.py
RENAMED
@@ -1,7 +1,3 @@
-
-from fastapi import FastAPI, UploadFile, Form, HTTPException
-from fastapi.responses import JSONResponse
-import uvicorn
 from typing import List
 import torch
 import soundfile as sf
@@ -10,7 +6,6 @@ import re
 import numpy as np
 import cmudict
 from io import BytesIO
-import os
 import logging
 from joblib import Memory
 from difflib import SequenceMatcher
@@ -24,267 +19,10 @@ from Bio import pairwise2
 from Bio.pairwise2 import format_alignment
 import asyncio
 from cachetools import TTLCache
-
-# Set the Numba cache directory to a writable location
-os.environ["NUMBA_CACHE_DIR"] = "/tmp"
-import librosa
-
-logging.basicConfig(level=logging.INFO)
-
-cmu = cmudict.dict()
-
-# Initialize FastAPI app
-app = FastAPI()
-
-# Load the processor and model
-MODEL_NAME = "mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme" # wav2vec based phoneme trascriber trained on L2-ARTIC
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-model.eval()
-
-# Check device availability
-# device = "cuda" if torch.cuda.is_available() else "cpu"
-device = 'cpu' # TEMP for testing
-model.to(device)
-
-whisper_processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
-whisper_model = AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-tiny.en")
-whisper_model.eval()
-whisper_model.to(device)
-
-# =====================================
-# Section: Utils
-# =====================================
-
-# Initialize a cache with a 5-minute TTL and 100 items max
-audio_cache = TTLCache(maxsize=100, ttl=300)
-cache_lock = asyncio.Lock() # To prevent race conditions
-
-import os
-from tempfile import NamedTemporaryFile
-import subprocess
-
-async def process_audio(audio, device):
-    """
-    Process an uploaded audio file and prepare input for the model.
-    Converts audio to WAV format using FFmpeg prior to processing.
-
-    Args:
-        audio: The uploaded audio file.
-        device: The device (e.g., 'cuda' or 'cpu') to move tensors to.
-
-    Returns:
-        cache_entry: A dictionary containing processed audio and model input.
-    """
-    filename = audio.filename
-
-    # Check cache for processed audio
-    if filename in audio_cache:
-        logging.info(f"Audio '{filename}' found in cache.")
-        return audio_cache[filename]
-
-    async with cache_lock: # Prevent race conditions during cache writes
-        if filename in audio_cache: # Double-check after acquiring lock
-            logging.info(f"Audio '{filename}' found in cache after lock.")
-            return audio_cache[filename]
-
-        logging.info(f"Processing audio '{filename}'.")
-
-        # Read the audio file into a temporary file
-        with NamedTemporaryFile(delete=False, suffix=".m4a") as temp_m4a:
-            temp_m4a_path = temp_m4a.name
-            temp_m4a.write(await audio.read())
-
-        # Convert M4A to WAV using FFmpeg
-        temp_wav_path = temp_m4a_path.replace(".m4a", ".wav")
-        try:
-            subprocess.run(
-                [
-                    "ffmpeg", "-i", temp_m4a_path, # Input file
-                    "-ar", "16000", # Resample to 16kHz
-                    "-ac", "1", # Convert to mono
-                    temp_wav_path # Output file
-                ],
-                check=True,
-                stdout=subprocess.PIPE,
-                stderr=subprocess.PIPE
-            )
-        except subprocess.CalledProcessError as e:
-            logging.error(f"FFmpeg conversion failed: {e.stderr.decode()}")
-            raise HTTPException(status_code=500, detail="Failed to process audio file.")
-        finally:
-            os.remove(temp_m4a_path) # Clean up the temporary M4A file
-
-        try:
-            # Load the WAV audio for further processing
-            audio_segment = AudioSegment.from_file(temp_wav_path, format="wav")
-            audio_samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
-            max_val = np.iinfo(np.int16).max
-            audio_samples /= max_val
-
-            if audio_segment.channels > 1:
-                audio_samples = audio_samples.reshape(-1, audio_segment.channels).mean(axis=1)
-
-            audio_input = librosa.resample(audio_samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
-            input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values.to(device)
-
-            # Cache the processed audio
-            cache_entry = {"audio_input": audio_input, "input_values": input_values, "ssl_logits": None}
-            audio_cache[filename] = cache_entry
-            return cache_entry
-
-        finally:
-            # Clean up the temporary WAV file
-            os.remove(temp_wav_path)
-
-
-async def run_ssl_inference(filename, input_values):
-    """
-    Run SSL model inference in the background and store the results in the cache.
-
-    Args:
-        filename: The name of the audio file.
-        input_values: The processed input tensor for the SSL model.
-    """
-    try:
-        logging.info(f"Running SSL inference for '{filename}' in the background.")
-        with torch.no_grad():
-            ssl_output = model(input_values).logits
-
-        # Update the cache with the SSL inference result
-        if filename in audio_cache:
-            audio_cache[filename]["ssl_logits"] = ssl_output
-            logging.info(f"SSL inference for '{filename}' completed and cached.")
-    except Exception as e:
-        logging.error(f"Error during SSL inference for '{filename}': {e}")
-
-def transcribe_into_English(audio_input):
-    # Load audio file
-    audio_input = whisper_processor(audio_input, sampling_rate=16000, return_tensors="pt", language="en").to(device)
-
-    # Perform transcription
-    with torch.no_grad():
-        generated_ids = whisper_model.generate(audio_input.input_features)
-
-    # Decode the transcription
-    transcription = whisper_processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-    return transcription.lower().strip()
-
-def get_nested_position(nested_list, flat_index):
-    """
-    Finds the nested list and the index within it for a given flat index.
-
-    Args:
-        nested_list (list of lists): The list of lists.
-        flat_index (int): The flattened index.
-
-    Returns:
-        tuple: (nested_list_index, element_index_in_nested_list)
-    """
-    cumulative_index = 0
-
-    for list_index, sublist in enumerate(nested_list):
-        # Check if the flat index falls within the current sublist
-        if cumulative_index + len(sublist) > flat_index:
-            # Calculate the index within the sublist
-            element_index = flat_index - cumulative_index
-            return list_index, element_index
-        # Update cumulative index
-        cumulative_index += len(sublist)
-
-    raise IndexError("Index out of range for the flattened list.")
-
-def label_specific_elements_in_reference(reference, start_word_idx, start_element_idx, end_word_idx, end_element_idx, label):
-    """
-    Labels elements in a nested list between specified start and end indices (inclusive).
-
-    Args:
-        reference (list of lists): The original list of lists.
-        start_word_idx (int): Index of the starting nested list.
-        start_element_idx (int): Index of the starting element in the start list.
-        end_word_idx (int): Index of the ending nested list.
-        end_element_idx (int): Index of the ending element in the end list.
-        label: The label to attach to the elements.
-
-    Returns:
-        list of lists: A new list of lists with labels attached where applicable.
-    """
-    labeled_reference = []
-    for word_idx, sublist in enumerate(reference):
-        labeled_sublist = []
-
-        for element_idx, element in enumerate(sublist):
-            if start_word_idx < end_word_idx:
-                # Case 1: start_word_idx < end_word_idx
-                if (
-                    (word_idx > start_word_idx and word_idx < end_word_idx) or
-                    (word_idx == start_word_idx and element_idx >= start_element_idx) or
-                    (word_idx == end_word_idx and element_idx <= end_element_idx)
-                ):
-                    # Attach the label to elements within the inclusive range
-                    if isinstance(element, tuple):
-                        print(f"There is already a label at index ({word_idx}, {element_idx})")
-                    labeled_sublist.append((element, label))
-                else:
-                    # Keep elements outside the range unchanged
-                    labeled_sublist.append(element)
-            elif start_word_idx == end_word_idx:
-                # Case 2: start_word_idx == end_word_idx
-                if word_idx == start_word_idx and start_element_idx <= element_idx <= end_element_idx:
-                    # Attach the label to elements within the inclusive range
-                    if isinstance(element, tuple):
-                        print(f"There is already a label at index ({word_idx}, {element_idx})")
-                    labeled_sublist.append((element, label))
-                else:
-                    # Keep elements outside the range unchanged
-                    labeled_sublist.append(element)
-
-        labeled_reference.append(labeled_sublist)
-
-    return labeled_reference
-
-def clean_text(text: str) -> str:
-    """
-    Remove punctuation from the input string except for special characters
-    that are part of a word, such as ' in I'm or - in hard-working.
-
-    Parameters:
-        text (str): Input string to clean.
-
-    Returns:
-        str: Cleaned string with allowed special characters retained.
-    """
-    # Allow letters, spaces, apostrophes, and hyphens within words
-    cleaned_text = re.sub(r'[^\w\s\'-]', '', text) # Remove punctuation except ' and -
-    cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Normalize spaces
-    return cleaned_text.lower().strip()
-
-# =====================================
-# Section: IPA Phonemes Utils
-# =====================================
-
-
-# WORKING: converting functions to class, currently done with the last function in the class
-import re
-from difflib import SequenceMatcher
-from IPython.display import HTML, display
-import copy
-from IPython.display import HTML, display
-from Bio import pairwise2
-from Bio.pairwise2 import format_alignment
-
-# WORKING: converting functions to class, currently done with the last function in the class
-import re
-from difflib import SequenceMatcher
-from IPython.display import HTML, display
-import copy
-from IPython.display import HTML, display
-from Bio import pairwise2
-from Bio.pairwise2 import format_alignment
-import cmudict
+from modules.pronunciation_coach.pronunciation_assessor_utils import *
 cmu_dict = cmudict.dict()
 
-class PronunciationAssessment:
+class PronunciationAssessor:
     def __init__(self, transcript, uttered_phonemes):
         # NOTE: removed all long signals ('ː') for compatibility with L2-artic's phoneme set (ssl model training set). American English.
         # ground truth phonemes are converted into arpabet first, and then into ipa using the arpabet_to_ipa dict, meaning the arpabet_to_ipa dict contains
@@ -1159,134 +897,4 @@ class PronunciationAssessment:
 
         # Display
         display(HTML(html_content))
-
-# health check
-@app.get("/")
-def home():
-    return "Healthy bro!"
-
-import time # temp
-
-# taking in both audio and transcript from the user
-@app.post("/predict")
-async def predict(audio: UploadFile, transcript: str = Form(...)):
-    """
-    Predict phoneme labels from uploaded audio and provided transcript.
-
-    Args:
-        audio (UploadFile): Uploaded audio file (WAV/MP3).
-        transcript (str): Ground truth transcript.
-
-    Returns:
-        JSONResponse: Contains phoneme labels.
-    """
-    logging.info("Received prediction request!")
-
-    # Validate file extension
-    allowed_extensions = {"wav", "mp3", "m4a"}
-    filename = audio.filename.lower()
-    start_time = time.time()
-
-    if not filename.endswith(tuple(allowed_extensions)):
-        raise HTTPException(
-            status_code=400,
-            detail="Invalid file type. Only WAV and MP3 files are supported.",
-        )
-
-    # Load and preprocess the audio
-    try:
-        cache_entry = await process_audio(audio, device)
-        input_values = cache_entry["input_values"]
-
-        # Ensure SSL inference is completed
-        logits = cache_entry.get("ssl_logits")
-        if logits is None:
-            logging.info(f"SSL inference not cached for '{filename}', running now.")
-            with torch.no_grad():
-                logits = model(input_values).logits
-            cache_entry["ssl_logits"] = logits
-
-        end_time = time.time()
-        print(f"Time from call to finish processing audio: {end_time - start_time} seconds")
-
-        start_time = time.time()
-        transcript = clean_text(transcript).strip()
-
-        # Decode the phonemes
-        predicted_ids = torch.argmax(logits, dim=-1)
-        uttered_phonemes = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-        end_time = time.time()
-        print("Time taken for inference:", end_time - start_time)
-
-        start_time = time.time()
-        # init PronunciationAssessment instance
-        cur = PronunciationAssessment(transcript, uttered_phonemes)
-        cur.convert_transcript_into_phonemes()
-        cur.clean_ipa_phonemes()
-        cur.split_phoneme_sequence()
-        print(cur.uttered_ipa_phonemes)
-        # print(cur.segmented_ground_truth_ipa_phonemes)
-        # print(cur.segmented_uttered_ipa_phonemes)
-
-        # generate the final labels
-        labels = cur.generate_labels_for_api()
-        end_time = time.time()
-        print("Time taken for label generation:", end_time - start_time)
-        return JSONResponse(content={"labels": labels})
-
-    except Exception as e:
-        logging.error(f"Error during prediction: {e}")
-        raise HTTPException(status_code=500, detail="An error occurred during processing.")
-
-# taking in audio only and returning the transcript
-@app.post("/transcribe")
-async def transcribe(audio: UploadFile):
-    """
-    Transcribe the uploaded audio and return the transcript.
-
-    Args:
-        audio (UploadFile): Uploaded audio file (WAV/MP3).
-
-    Returns:
-        JSONResponse: Contains the transcript.
-    """
-    logging.info("Received transcription request!")
-
-    # Validate file extension
-    allowed_extensions = {"wav", "mp3", "m4a"}
-    filename = audio.filename.lower()
-    if not filename.endswith(tuple(allowed_extensions)):
-        raise HTTPException(
-            status_code=400,
-            detail="Invalid file type. Only WAV and MP3 files are supported.",
-        )
-
-    # Load and preprocess the audio
-    try:
-        # Process the audio
-        start_time = time.time()
-        cache_entry = await process_audio(audio, device)
-        audio_input = cache_entry["audio_input"]
-        input_values = cache_entry["input_values"]
-
-        # Start SSL inference in the background
-        asyncio.create_task(run_ssl_inference(audio.filename, input_values))
-
-        # Get transcript
-        end_time = time.time()
-        print(f"Time from call to finish processing audio: {end_time - start_time} seconds")
-        transcript = transcribe_into_English(audio_input)
-        transcript = clean_text(transcript).strip()
-        another_end_time = time.time()
-        logging.info(f"Transcript: {transcript}, Time taken from processed audio to finish transcription: {another_end_time - end_time} seconds")
-
-        return JSONResponse(content={"transcript": transcript})
-
-    except Exception as e:
-        logging.error(f"Error during transcription: {e}")
-        raise HTTPException(status_code=500, detail="An error occurred during processing.")
-
-# if __name__ == '__main__':
-#     port = os.environ.get("PORT", 10000) # Default to 10000 if PORT is not set
-#     logging.info(f"Starting server on PORT {port}")
-#     uvicorn.run("app:app", host="0.0.0.0", port=int(port), log_level="info")
+
app/modules/pronunciation_coach/pronunciation_assessor_utils.py
ADDED
@@ -0,0 +1,73 @@
+import numpy as np
+def get_nested_position(nested_list, flat_index):
+    """
+    Finds the nested list and the index within it for a given flat index.
+
+    Args:
+        nested_list (list of lists): The list of lists.
+        flat_index (int): The flattened index.
+
+    Returns:
+        tuple: (nested_list_index, element_index_in_nested_list)
+    """
+    cumulative_index = 0
+
+    for list_index, sublist in enumerate(nested_list):
+        # Check if the flat index falls within the current sublist
+        if cumulative_index + len(sublist) > flat_index:
+            # Calculate the index within the sublist
+            element_index = flat_index - cumulative_index
+            return list_index, element_index
+        # Update cumulative index
+        cumulative_index += len(sublist)
+
+    raise IndexError("Index out of range for the flattened list.")
+
+def label_specific_elements_in_reference(reference, start_word_idx, start_element_idx, end_word_idx, end_element_idx, label):
+    """
+    Labels elements in a nested list between specified start and end indices (inclusive).
+
+    Args:
+        reference (list of lists): The original list of lists.
+        start_word_idx (int): Index of the starting nested list.
+        start_element_idx (int): Index of the starting element in the start list.
+        end_word_idx (int): Index of the ending nested list.
+        end_element_idx (int): Index of the ending element in the end list.
+        label: The label to attach to the elements.
+
+    Returns:
+        list of lists: A new list of lists with labels attached where applicable.
+    """
+    labeled_reference = []
+    for word_idx, sublist in enumerate(reference):
+        labeled_sublist = []
+
+        for element_idx, element in enumerate(sublist):
+            if start_word_idx < end_word_idx:
+                # Case 1: start_word_idx < end_word_idx
+                if (
+                    (word_idx > start_word_idx and word_idx < end_word_idx) or
+                    (word_idx == start_word_idx and element_idx >= start_element_idx) or
+                    (word_idx == end_word_idx and element_idx <= end_element_idx)
+                ):
+                    # Attach the label to elements within the inclusive range
+                    if isinstance(element, tuple):
+                        print(f"There is already a label at index ({word_idx}, {element_idx})")
+                    labeled_sublist.append((element, label))
+                else:
+                    # Keep elements outside the range unchanged
+                    labeled_sublist.append(element)
+            elif start_word_idx == end_word_idx:
+                # Case 2: start_word_idx == end_word_idx
+                if word_idx == start_word_idx and start_element_idx <= element_idx <= end_element_idx:
+                    # Attach the label to elements within the inclusive range
+                    if isinstance(element, tuple):
+                        print(f"There is already a label at index ({word_idx}, {element_idx})")
+                    labeled_sublist.append((element, label))
+                else:
+                    # Keep elements outside the range unchanged
+                    labeled_sublist.append(element)
+
+        labeled_reference.append(labeled_sublist)
+
+    return labeled_reference
app/routes/__init__.py
ADDED
File without changes

app/routes/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (171 Bytes).

app/routes/__pycache__/predict.cpython-39.pyc
ADDED
Binary file (2.19 kB).

app/routes/__pycache__/transcribe.cpython-39.pyc
ADDED
Binary file (2.22 kB).

app/routes/predict.py
ADDED
@@ -0,0 +1,58 @@
+from fastapi import FastAPI, UploadFile, Form, HTTPException, APIRouter, Depends
+from fastapi.responses import JSONResponse
+import uvicorn
+from typing import List
+import torch
+import soundfile as sf
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import re
+import numpy as np
+import cmudict
+from io import BytesIO
+import logging
+from joblib import Memory
+from difflib import SequenceMatcher
+import eng_to_ipa as ipa_conv
+import copy
+from IPython.display import HTML, display
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from pydub import AudioSegment
+from Bio import pairwise2
+from Bio.pairwise2 import format_alignment
+import asyncio
+from cachetools import TTLCache
+import time
+import os
+from tempfile import NamedTemporaryFile
+import subprocess
+import librosa
+
+# package imports
+from services.evaluate_pronunciation import PronunciationEvalService
+from utils.general_utils import clean_text
+
+router = APIRouter()
+
+@router.post("/predict", summary="Evaluate pronunciation")
+async def evaluate_pronunciation(audio: UploadFile, transcript: str = Form(...)):
+    """
+    Predict phoneme labels from uploaded audio and provided transcript.
+
+    Args:
+        audio (UploadFile): Uploaded audio file (WAV/MP3).
+        transcript (str): Ground truth transcript.
+
+    Returns:
+        JSONResponse: Contains phoneme labels.
+    """
+    try:
+        # Call the service to process and transcribe the audio
+        service = PronunciationEvalService(transcript, audio)
+        labels = await service.generate_labels()
+
+        response = {'labels': labels}
+        return JSONResponse(content=response)
+
+    except Exception as e:
+        logging.error(f"Error during evaluation: {e}")
+        raise HTTPException(status_code=500, detail="An error occurred during processing.")
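The /predict route now only unpacks the request and delegates to PronunciationEvalService. A hypothetical client call (placeholder URL and file name; requests is not a dependency shown in this diff) might look like:

# Hypothetical client call against the new /predict route.
import requests

with open("sample.m4a", "rb") as f:
    resp = requests.post(
        "http://localhost:7860/predict",
        files={"audio": ("sample.m4a", f, "audio/mp4")},
        data={"transcript": "hello world"},
    )
print(resp.status_code, resp.json())  # expected shape on success: {"labels": [...]}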
app/routes/transcribe.py
ADDED
@@ -0,0 +1,61 @@
+from fastapi import FastAPI, UploadFile, Form, HTTPException, APIRouter, Depends
+from fastapi.responses import JSONResponse
+import uvicorn
+from typing import List
+import torch
+import soundfile as sf
+from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
+import re
+import numpy as np
+import cmudict
+from io import BytesIO
+import logging
+from joblib import Memory
+from difflib import SequenceMatcher
+import eng_to_ipa as ipa_conv
+import copy
+from IPython.display import HTML, display
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+from pydub import AudioSegment
+from Bio import pairwise2
+from Bio.pairwise2 import format_alignment
+import asyncio
+from cachetools import TTLCache
+import time
+import os
+from tempfile import NamedTemporaryFile
+import subprocess
+import librosa
+
+# package imports
+from services.transcribe import TranscriptionService
+from utils.general_utils import clean_text
+
+router = APIRouter()
+
+service = TranscriptionService()
+@router.post("/transcribe", summary="Trancribe audio into English")
+async def transcribe(audio: UploadFile):
+    """
+    Transcribe the uploaded audio and return the transcript.
+
+    Args:
+        audio (UploadFile): Uploaded audio file.
+
+    Returns:
+        JSONResponse: Contains the transcript.
+    """
+    try:
+        # Call the service to process and transcribe the audio
+        transcript = await service.transcribe_audio(audio)
+        transcript = clean_text(transcript).strip()
+
+        response = {'transcript': transcript}
+        return JSONResponse(content=response)
+
+    except ValueError as ve:
+        logging.error(f"Validation error: {ve}")
+        raise HTTPException(status_code=400, detail=str(ve))
+    except Exception as e:
+        logging.error(f"Error during transcription: {e}")
+        raise HTTPException(status_code=500, detail="An error occurred during processing.")
app/services/__init__.py
ADDED
File without changes

app/services/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (173 Bytes).

app/services/__pycache__/evaluate_pronunciation.cpython-39.pyc
ADDED
Binary file (2.67 kB).

app/services/__pycache__/transcribe.cpython-39.pyc
ADDED
Binary file (1.9 kB).

app/services/evaluate_pronunciation.py
ADDED
@@ -0,0 +1,69 @@
+import logging
+import time
+import asyncio
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+
+from models.ssl_singleton import ssl_model
+from utils.general_utils import process_audio, clean_text
+from modules.pronunciation_coach.pronunciation_assessor import PronunciationAssessor
+from utils.cache import audio_cache
+# process -> call infereence -> structure output -> return
+
+class PronunciationEvalService:
+    def __init__(self, transcript, audio):
+        """
+        Initialize the transcription service.
+
+        Args:
+            transcript (str): Ground truth transcript.
+            audio (UploadFile): Uploaded audio file.
+        """
+        self.ssl_model = ssl_model
+        # device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = 'cpu' # TEMP for testing
+        self.transcript = clean_text(transcript).strip()
+        self.audio = audio
+        self.filename = audio.filename
+        self.uttered_phonemes = None
+        self.assessor = None
+
+    async def get_uttered_phonemes(self):
+        # check if cache has filename
+        audio = self.audio
+        start_time = time.time()
+        audio_inputs = None
+        if await audio_cache.contains(self.filename):
+            async with audio_cache.lock:
+                if audio_cache.cache[self.filename]["uttered_phonemes"] != None:
+                    logging.info(f"Audio '{self.filename}' found in cache.")
+                    end_time = time.time()
+                    logging.info(f"Time from for getting uttered phonemes: {end_time - start_time} seconds")
+                    return audio_cache.cache[self.filename]["uttered_phonemes"]
+                else:
+                    logging.info(f"Audio '{self.filename}' found in cache but not inferenced. Running inference...")
+                    audio_inputs = audio_cache.cache[self.filename]["audio_input"]
+        else:
+            logging.info(f"Audio '{self.filename}' not found in cache. Running inference...")
+
+        if audio_inputs is None:
+            cache_entry = await process_audio(audio, self.device)
+            audio_inputs = cache_entry["audio_input"]
+
+        uttered_phonemes = await self.ssl_model.infer_and_save_to_cache(self.filename, audio_inputs, self.device)
+        end_time = time.time()
+        logging.info(f"Time for getting uttered phonemes: {end_time - start_time} seconds")
+        return uttered_phonemes
+
+    async def generate_labels(self):
+        self.uttered_phonemes = await self.get_uttered_phonemes()
+        start_time = time.time()
+        self.assessor = PronunciationAssessor(self.transcript, self.uttered_phonemes)
+        self.assessor.convert_transcript_into_phonemes()
+        self.assessor.clean_ipa_phonemes()
+        self.assessor.split_phoneme_sequence()
+
+        labels = self.assessor.generate_labels_for_api()
+        end_time = time.time()
+        print("Time taken for label generation after getting uttered phonemes:", end_time - start_time)
+
+        return labels
app/services/transcribe.py
ADDED
@@ -0,0 +1,56 @@
+import logging
+import time
+import asyncio
+from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
+
+from models.transcriber_singleton import transcriber_model
+from models.ssl_singleton import ssl_model
+from utils.general_utils import process_audio, clean_text
+
+# from utils.transcribe_utils import transcribe_into_English, clean_text
+# process -> call infereence -> structure output -> return
+
+class TranscriptionService:
+    def __init__(self):
+        """
+        Initialize the transcription service.
+        """
+
+        self.transcriber_model = transcriber_model
+        # device = "cuda" if torch.cuda.is_available() else "cpu"
+        self.device = 'cpu' # TEMP for testing
+
+    async def transcribe_audio(self, audio):
+        """
+        Process the uploaded audio file and return its transcription.
+
+        Args:
+            audio (UploadFile): Uploaded audio file.
+
+        Returns:
+            str: The transcript.
+        """
+        logging.info("Received transcription request!")
+
+        try:
+            # Step 1: Process the audio and check cache
+            start_time = time.time()
+            cache_entry = await process_audio(audio, self.device)
+            audio_input = cache_entry["audio_input"]
+
+            # Step 2: Start SSL inference in the background
+            asyncio.create_task(ssl_model.infer_and_save_to_cache(audio.filename, audio_input, self.device))
+
+            # Step 3: Get the transcript using Whisper
+            end_time = time.time()
+            logging.info(f"Time from call to finish processing audio: {end_time - start_time} seconds")
+            transcript = self.transcriber_model.transcribe_into_English(audio_input)
+            # Log processing time
+            another_end_time = time.time()
+            logging.info(f"Transcript: {transcript}, Time taken from processed audio to finish transcription: {another_end_time - end_time} seconds")
+
+            return transcript
+
+        except Exception as e:
+            logging.error(f"Error during transcription: {e}")
+            raise
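transcribe_audio schedules SSL phoneme inference with asyncio.create_task and returns the Whisper transcript without waiting for it, so the phonemes can already be cached by the time /predict is called. A toy sketch of that fire-and-forget pattern, using stand-in coroutines rather than the real models:

# Toy illustration of the scheduling used in transcribe_audio (not repo code).
import asyncio

async def phoneme_inference(name: str) -> str:
    await asyncio.sleep(0.1)                      # stand-in for SSL model inference
    return f"phonemes:{name}"

async def handle_transcribe(name: str) -> str:
    task = asyncio.create_task(phoneme_inference(name))  # scheduled, not awaited here
    transcript = f"transcript:{name}"                    # foreground work finishes first
    await task        # only so this demo's loop exits cleanly; the service skips this
    return transcript

print(asyncio.run(handle_transcribe("clip.m4a")))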
notebook-inference.ipynb → app/tester-notebook.ipynb
RENAMED
The diff for this file is too large to render. See raw diff.

app/utils/__init__.py
ADDED
File without changes

app/utils/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (170 Bytes).

app/utils/__pycache__/cache.cpython-39.pyc
ADDED
Binary file (2.31 kB).

app/utils/__pycache__/general_utils.cpython-39.pyc
ADDED
Binary file (2.42 kB).
app/utils/cache.py
ADDED
@@ -0,0 +1,48 @@
+import asyncio
+from cachetools import TTLCache
+
+class CacheManager:
+    _instance = None
+
+    def __new__(cls, *args, **kwargs):
+        if not cls._instance:
+            cls._instance = super(CacheManager, cls).__new__(cls, *args, **kwargs)
+            cls._instance._initialize()
+        return cls._instance
+
+    def _initialize(self):
+        # Initialize the cache and lock only once
+        self.cache = TTLCache(maxsize=100, ttl=300)
+        self.lock = asyncio.Lock()
+
+    async def set(self, key, value):
+        async with self.lock:
+            self.cache[key] = value
+
+    async def get(self, key):
+        async with self.lock:
+            return self.cache.get(key, None)
+
+    async def contains(self, key):
+        async with self.lock:
+            return key in self.cache
+
+    async def delete(self, key):
+        async with self.lock:
+            if key in self.cache:
+                del self.cache[key]
+
+    def set_without_lock(self, key, value):
+        self.cache[key] = value
+
+    def get_without_lock(self, key):
+        return self.cache.get(key, None)
+
+    def contains_without_lock(self, key):
+        return key in self.cache
+
+    def delete_without_lock(self, key):
+        if key in self.cache:
+            del self.cache[key]
+
+audio_cache = CacheManager()
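CacheManager wraps a cachetools TTLCache (100 entries, 5-minute TTL) behind an asyncio lock, with *_without_lock variants for callers that already hold manager.lock. A minimal usage sketch, assuming the module path added in this commit and app/ on sys.path:

# Minimal async usage of the CacheManager singleton (illustrative).
import asyncio
from utils.cache import audio_cache

async def demo():
    await audio_cache.set("clip.m4a", {"audio_input": None, "uttered_phonemes": None})
    if await audio_cache.contains("clip.m4a"):
        entry = await audio_cache.get("clip.m4a")
        print(entry)
    await audio_cache.delete("clip.m4a")

asyncio.run(demo())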
app/utils/general_utils.py
ADDED
@@ -0,0 +1,73 @@
+import re
+import logging
+import torch
+from tempfile import NamedTemporaryFile
+import numpy as np
+import librosa
+from pydub import AudioSegment
+import subprocess
+import os
+from fastapi import FastAPI, UploadFile, Form, HTTPException
+from io import BytesIO
+from utils.cache import audio_cache
+import asyncio
+
+async def process_audio(audio, device):
+    """
+    Process an uploaded audio file and prepare input for the model.
+
+    Args:
+        audio: The uploaded audio file.
+        device: The device (e.g., 'cuda' or 'cpu') to move tensors to.
+
+    Returns:
+        cache_entry: A dictionary containing processed audio and model input.
+    """
+    filename = audio.filename
+
+    # Check cache for processed audio
+    if await audio_cache.contains(filename):
+        logging.info(f"Audio '{filename}' found in cache.")
+        return await audio_cache.get(filename)
+
+    # Prevent race conditions during cache writes
+    async with audio_cache.lock:
+        # Double-check after acquiring lock
+        if audio_cache.contains_without_lock(filename):
+            logging.info(f"Audio '{filename}' found in cache after lock.")
+            return audio_cache.contains_without_lock(filename)
+        logging.info(f"Processing audio '{filename}'.")
+
+        # Read and preprocess the audio
+        audio_bytes = BytesIO(await audio.read())
+        audio_segment = AudioSegment.from_file(audio_bytes, format="m4a")
+        audio_samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
+        max_val = np.iinfo(np.int16).max
+        audio_samples /= max_val
+
+        if audio_segment.channels > 1:
+            audio_samples = audio_samples.reshape(-1, audio_segment.channels).mean(axis=1)
+
+        audio_input = librosa.resample(audio_samples, orig_sr=audio_segment.frame_rate, target_sr=16000)
+        # input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values.to(device)
+
+        # Cache the processed audio
+        cache_entry = {"audio_input": audio_input, "input_values": None, "ssl_logits": None}
+        audio_cache.set_without_lock(filename, cache_entry)
+        return cache_entry
+
+def clean_text(text: str) -> str:
+    """
+    Remove punctuation from the input string except for special characters
+    that are part of a word, such as ' in I'm or - in hard-working.
+
+    Parameters:
+        text (str): Input string to clean.
+
+    Returns:
+        str: Cleaned string with allowed special characters retained.
+    """
+    # Allow letters, spaces, apostrophes, and hyphens within words
+    cleaned_text = re.sub(r'[^\w\s\'-]', '', text) # Remove punctuation except ' and -
+    cleaned_text = re.sub(r'\s+', ' ', cleaned_text) # Normalize spaces
+    return cleaned_text.lower().strip()
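clean_text keeps word-internal apostrophes and hyphens while stripping other punctuation and collapsing whitespace; a self-contained illustration of the two regex steps (the expected output is my reading of the regexes, not taken from the repo):

# Illustration of clean_text behaviour (standalone copy of the regex steps).
import re

def clean_text(text: str) -> str:
    cleaned_text = re.sub(r"[^\w\s'-]", '', text)     # drop punctuation except ' and -
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # collapse runs of whitespace
    return cleaned_text.lower().strip()

print(clean_text("I'm hard-working, really!"))  # -> i'm hard-working really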
inference.py
DELETED
@@ -1,214 +0,0 @@
-import torch
-import librosa
-import soundfile as sf
-from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
-import re
-import numpy as np
-import cmudict
-
-# Load the processor and model
-MODEL_NAME = "mrrubino/wav2vec2-large-xlsr-53-l2-arctic-phoneme"  # wav2vec-based phoneme transcriber trained on L2-ARCTIC
-processor = Wav2Vec2Processor.from_pretrained(MODEL_NAME)
-model = Wav2Vec2ForCTC.from_pretrained(MODEL_NAME)
-model.eval()
-
-# Check device availability
-device = "cuda" if torch.cuda.is_available() else "cpu"
-model.to(device)
-
-def load_audio(audio_path, target_sr=16000):
-    """Load an audio file and resample it to 16kHz."""
-    audio, sr = librosa.load(audio_path, sr=target_sr)
-    return audio
-
-# Original ARPAbet to IPA mapping from SoapBox Labs
-arpabet_to_ipa = {
-    "AA": "a", "AE": "æ", "AH": "ʌ", "AO": "ɔ", "AW": "aʊ", "AY": "aɪ",
-    "EH": "ɛ", "ER": "ɚ", "EY": "eɪ", "IH": "ɪ", "IY": "i", "OW": "oʊ",
-    "OY": "ɔɪ", "UH": "ʊ", "UW": "u", "B": "b", "CH": "t͡ʃ", "D": "d",
-    "DH": "ð", "F": "f", "G": "ɡ", "HH": "h", "JH": "dʒ", "K": "k",
-    "L": "l", "M": "m", "N": "n", "NG": "ŋ", "P": "p", "R": "ɹ",
-    "S": "s", "SH": "ʃ", "T": "t", "TH": "θ", "V": "v", "W": "w",
-    "Y": "j", "Z": "z", "ZH": "ʒ"
-}
-
-# Invert the dictionary to map IPA to ARPAbet
-ipa_to_arpabet = {v: k for k, v in arpabet_to_ipa.items()}
-
-def convert_ipa_to_arpabet(ipa_words):
-    """
-    Convert a list of IPA words (strings of concatenated phonemes) to ARPAbet words.
-
-    :param ipa_words: List of IPA words where each word is a string of concatenated phonemes.
-    :return: List of lists, where each inner list contains ARPAbet phonemes for a word.
-    """
-    arpabet_words = []
-    for word in ipa_words:
-        # Break the word into phonemes
-        phonemes = []  # Collect matched phonemes
-        i = 0
-        while i < len(word):
-            matched = False
-            # Match multi-character IPA phonemes first
-            for ipa_phoneme in sorted(ipa_to_arpabet.keys(), key=len, reverse=True):
-                if word[i:].startswith(ipa_phoneme):
-                    phonemes.append(ipa_to_arpabet[ipa_phoneme])
-                    i += len(ipa_phoneme)
-                    matched = True
-                    break
-            # If no match, add an unknown marker and move forward
-            if not matched:
-                phonemes.append("<UNK>")
-                i += 1
-        # Append the list of phonemes for the word
-        arpabet_words.append(phonemes)
-    return arpabet_words
-
-def remove_numbers_from_phonemes(phon_list):
-    """
-    Remove all numbers from phonemes in a nested list.
-
-    Parameters:
-        phon_list (list of lists): Nested list of phonemes.
-
-    Returns:
-        list of lists: Updated nested list with numbers removed from phonemes.
-    """
-    cleaned_phon_list = []
-    for word_phonemes in phon_list:
-        cleaned_word = [re.sub(r'\d', '', phoneme) for phoneme in word_phonemes]
-        cleaned_phon_list.append(cleaned_word)
-    return cleaned_phon_list
-
-def align_phoneme_sequences(truth_words, uttered_words, gap_penalty=1, substitution_cost=1):
-    """
-    Align phoneme sequences separated by words.
-
-    Parameters:
-        truth_words (list of lists): Ground truth phoneme sequences grouped by words.
-        uttered_words (list of lists): Uttered phoneme sequences grouped by words.
-        gap_penalty (int): Penalty for gaps.
-        substitution_cost (int): Cost for substitutions.
-
-    Returns:
-        alignment (list of tuples): Aligned phoneme sequences with '-' for gaps.
-    """
-    def align_two_sequences(seq1, seq2):
-        """
-        Align two sequences using dynamic programming.
-        """
-        n = len(seq1)
-        m = len(seq2)
-        dp = np.zeros((n + 1, m + 1))
-
-        # Initialize DP table
-        for i in range(n + 1):
-            dp[i][0] = i * gap_penalty
-        for j in range(m + 1):
-            dp[0][j] = j * gap_penalty
-
-        # Fill DP table
-        for i in range(1, n + 1):
-            for j in range(1, m + 1):
-                match_cost = 0 if seq1[i - 1] == seq2[j - 1] else substitution_cost
-                dp[i][j] = min(
-                    dp[i - 1][j - 1] + match_cost,  # Match or substitution
-                    dp[i - 1][j] + gap_penalty,     # Deletion
-                    dp[i][j - 1] + gap_penalty      # Insertion
-                )
-
-        # Traceback to find alignment
-        alignment_seq1 = []
-        alignment_seq2 = []
-        i, j = n, m
-        while i > 0 or j > 0:
-            if i > 0 and j > 0 and dp[i][j] == dp[i - 1][j - 1] + (0 if seq1[i - 1] == seq2[j - 1] else substitution_cost):
-                alignment_seq1.append(seq1[i - 1])
-                alignment_seq2.append(seq2[j - 1])
-                i -= 1
-                j -= 1
-            elif i > 0 and dp[i][j] == dp[i - 1][j] + gap_penalty:
-                alignment_seq1.append(seq1[i - 1])
-                alignment_seq2.append('-')
-                i -= 1
-            else:
-                alignment_seq1.append('-')
-                alignment_seq2.append(seq2[j - 1])
-                j -= 1
-
-        return alignment_seq1[::-1], alignment_seq2[::-1]
-
-    # Align each word pair
-    alignment = []
-    for truth_word, uttered_word in zip(truth_words, uttered_words):
-        aligned_truth, aligned_uttered = align_two_sequences(truth_word, uttered_word)
-        alignment.append((aligned_truth, aligned_uttered))
-
-    return alignment
-
-def generate_phoneme_labels(data):
-    """
-    Generate phoneme labels for comparison of expected and uttered phonemes.
-
-    Parameters:
-        data (list of tuples): Each tuple contains (expected phonemes, uttered phonemes).
-
-    Returns:
-        list of tuples: Each tuple contains (phonemes, labels).
-        Phonemes are from the expected list, and labels are binary (0: correct, 1: incorrect).
-    """
-    results = []
-    for expected, uttered in data:
-        labels = [
-            0 if exp == utt else 1
-            for exp, utt in zip(expected, uttered)
-        ]
-        results.append((expected, labels))
-    return results
-
-def convert_words_to_phonemes(words, cmu_dict):
-    phonemes = []
-    for word in words:
-        if word in cmu_dict:
-            phonemes.extend(cmu_dict[word][0])  # Use the first phoneme representation
-        else:
-            phonemes.append('<UNK>')  # Append '<UNK>' for unknown words
-    return phonemes
-
-# RUN
-
-def predict():
-    cmu = cmudict.dict()
-
-    # Path to test audio file
-    audio_path = '/content/drive/MyDrive/Test Audio/test5-good.m4a'  # Replace with your audio file path
-
-    # Define the script
-    transcript = "the person that sat on the floor is punched"
-
-    # Load audio and normalize
-    audio_input = load_audio(audio_path)
-    input_values = processor(audio_input, return_tensors="pt", sampling_rate=16000).input_values
-    input_values = input_values.to(device)
-
-    # Step 3: Perform inference
-    with torch.no_grad():
-        logits = model(input_values).logits
-
-    # Step 4: Decode the phonemes
-    predicted_ids = torch.argmax(logits, dim=-1)
-    uttured_transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-
-    # Convert the uttered IPA transcription into ARPAbet (for comparison)
-    uttured_phons = convert_ipa_to_arpabet(uttured_transcript.split())
-
-    # Convert the ground-truth text into ARPAbet (for comparison) and remove (ignore) stress markers (may upgrade to evaluate stress later)
-    trans_phons = [convert_words_to_phonemes([word], cmu) for word in transcript.split()]
-    cleaned_trans_phons = remove_numbers_from_phonemes(trans_phons)
-
-    # Generate labels
-    alignment = align_phoneme_sequences(cleaned_trans_phons, uttured_phons)
-    phoneme_labels = generate_phoneme_labels(alignment)
-
-    print(phoneme_labels)
-    return phoneme_labels
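
A toy, hypothetical walk-through of the alignment and labelling helpers defined in inference.py above (made-up phoneme sequences, assuming those helpers and numpy are in scope):

    # Ground-truth vs. uttered ARPAbet phonemes for two hypothetical words ("the floor"):
    truth = [['DH', 'AH'], ['F', 'L', 'AO', 'R']]
    uttered = [['D', 'AH'], ['F', 'L', 'AO']]   # 'DH' substituted with 'D', final 'R' dropped

    alignment = align_phoneme_sequences(truth, uttered)
    # [(['DH', 'AH'], ['D', 'AH']), (['F', 'L', 'AO', 'R'], ['F', 'L', 'AO', '-'])]

    labels = generate_phoneme_labels(alignment)
    # [(['DH', 'AH'], [1, 0]), (['F', 'L', 'AO', 'R'], [0, 0, 0, 1])]
    # 0 = phoneme matched the expectation, 1 = substitution or omission
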