Spaces:

mrchan1207
/

phoneme_transciptor

Sleeping

App Files Files Community

mrchan1207 committed on Aug 26

Commit

4a55f3c

verified ·

1 Parent(s): 0a04551

Upload 5 files (#1)

Browse files

Files changed (5) hide show

Dockerfile +29 -0
README.md +3 -4
app.py +110 -0
gitattributes +35 -0
requirements.txt +13 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,29 @@

+# Use an official Python runtime as a parent image
+FROM python:3.10-slim
+# Create a world-writable cache directory so downloads succeed whichever user
+# the container ends up running as.
+RUN mkdir -p /cache && chmod 777 /cache
+# Point the Hugging Face caches at that directory (each variable set once).
+ENV HF_HOME="/cache"
+ENV HUGGINGFACE_HUB_CACHE="/cache"
+ENV TRANSFORMERS_CACHE="/cache"
+ENV LIBROSA_CACHE_DISABLE="1"
+# Same value app.py assigns at startup; setting it in the image makes it
+# visible before the Python interpreter imports anything.
+ENV NUMBA_CACHE_DIR="/tmp/"
+# Set the working directory in the container
+WORKDIR /code
+# Copy the requirements file first so the dependency layer is cached
+# independently of changes to the app code
+COPY ./requirements.txt /code/requirements.txt
+# Install any needed packages specified in requirements.txt
+RUN pip install --no-cache-dir --upgrade -r /code/requirements.txt
+# Copy the app code into the container
+COPY ./app.py /code/app.py
+# Expose the port the app runs on. HF Spaces uses 7860.
+EXPOSE 7860
+# Command to run the app using uvicorn
+# The host must be 0.0.0.0 to be accessible from outside the container
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]

README.md CHANGED Viewed

@@ -1,12 +1,11 @@
 ---
-title: Wav2vec2
-emoji: 🚀
 colorFrom: purple
-colorTo: purple
 sdk: docker
 pinned: false
 license: mit
-short_description: Phoneme detection from audio
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Phoneme Transcriptor
+emoji: 💻
 colorFrom: purple
+colorTo: yellow
 sdk: docker
 pinned: false
 license: mit
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import asyncio
+import io
+import os
+import torch
+import librosa
+import soundfile as sf
+from fastapi import FastAPI, File, UploadFile, Request
+from fastapi.responses import JSONResponse
+from transformers import Wav2Vec2ForCTC, Wav2Vec2Processor
+print("--- SCRIPT START: app.py v3 ---")
+os.environ[ 'NUMBA_CACHE_DIR' ] = '/tmp/'
+os.environ['LIBROSA_CACHE_DISABLE'] = '1'
+print(f"--- LIBROSA_CACHE_DISABLE is set to: {os.environ.get('LIBROSA_CACHE_DISABLE')} ---")
+# --- 1. Initialize FastAPI App ---
+app = FastAPI(
+    title="Audio Transcription API",
+    description="An API to transcribe audio files using a Wav2Vec2 model.",
+)
+# --- 2. Load Model and Processor (with GPU support) ---
+# <-- CHANGED: Detect if a GPU is available, otherwise use CPU
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+print(f"Using device: {device}")
+MODEL_ID = "Bluecast/wav2vec2-Phoneme"
+print(f"Loading model: {MODEL_ID}...")
+try:
+    processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
+    model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
+    # <-- CHANGED: Move the model to the selected device (GPU or CPU)
+    model.to(device)
+    print("Model loaded successfully.")
+except Exception as e:
+    print(f"Error loading model: {e}")
+    model = None
+    processor = None
+# --- 3. Define the Transcription Endpoint ---
+@app.post("/transcribe/")
+async def transcribe(audio_file: UploadFile = File(...)):
+    if not model or not processor:
+        return JSONResponse(status_code=503, content={"error": "Model is not loaded."})
+    try:
+        contents = await audio_file.read()
+        audio_data, original_sr = sf.read(io.BytesIO(contents))
+        if audio_data.ndim > 1:
+            audio_data = audio_data.mean(axis=1)
+        resampled_audio = librosa.resample(y=audio_data, orig_sr=original_sr, target_sr=16000)
+        inputs = processor(resampled_audio, sampling_rate=16000, return_tensors="pt", padding=True)
+        # <-- CHANGED: Move the input tensors to the same device as the model
+        inputs = inputs.to(device)
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.batch_decode(predicted_ids)[0]
+        print(f"Transcription complete: {transcription}")
+        return {"transcription": transcription}
+    except Exception as e:
+        print(f"Error during transcription: {str(e)}")
+        return JSONResponse(status_code=500, content={"error": f"An error occurred: {str(e)}"})
+@app.post("/transcribe_audio/")
+async def transcribe_audio(request: Request):
+    if not model or not processor:
+        return JSONResponse(status_code=503, content={"error": "Model is not loaded."})
+    try:
+        contents = await request.body()
+        audio_data, original_sr = sf.read(io.BytesIO(contents))
+        if audio_data.ndim > 1:
+            audio_data = audio_data.mean(axis=1)
+        resampled_audio = librosa.resample(y=audio_data, orig_sr=original_sr, target_sr=16000)
+        inputs = processor(resampled_audio, sampling_rate=16000, return_tensors="pt", padding=True)
+        # <-- CHANGED: Move the input tensors to the same device as the model
+        inputs = inputs.to(device)
+        with torch.no_grad():
+            logits = model(**inputs).logits
+        predicted_ids = torch.argmax(logits, dim=-1)
+        transcription = processor.batch_decode(predicted_ids)[0]
+        print(f"Transcription complete: {transcription}")
+        return {"transcription": transcription}
+    except Exception as e:
+        print(f"Error during transcription: {str(e)}")
+        return JSONResponse(status_code=500, content={"error": f"An error occurred: {str(e)}"})
+# --- 4. Root Endpoint for Health Check ---
+@app.get("/")
+def read_root():
+    return {"status": "API is running."}

gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+fastapi
+uvicorn
+python-multipart==0.0.20
+soundfile==0.13.1
+librosa==0.10.1
+joblib==1.3.2
+# For Hugging Face and PyTorch
+# Index options belong on their own line in a requirements file; appended to
+# a requirement line they are not a supported per-requirement option.
+--extra-index-url https://download.pytorch.org/whl/cu121
+transformers==4.40.0
+torch==2.2.1
+datasets==2.19.1
+tokenizers==0.19.1
+numpy==1.26.4