Spaces:
Runtime error
Runtime error
update
Browse files- app/main.py +9 -3
- app/mfcc.py +1 -2
- app/routers/V1/voice/voice_router.py +26 -8
- app/transcriber.py +2 -4
app/main.py
CHANGED
@@ -11,9 +11,14 @@ from app.routers import routes
|
|
11 |
app = FastAPI(
|
12 |
title="Mother Tongue Voice Matcher",
|
13 |
version="0.0.5",
|
14 |
-
servers=[
|
15 |
-
|
16 |
-
|
|
|
|
|
|
|
|
|
|
|
17 |
root_path="/api/v1",
|
18 |
root_path_in_servers=False,
|
19 |
)
|
@@ -29,6 +34,7 @@ origins = [
|
|
29 |
"http://127.0.0.1:8080",
|
30 |
"http://127.0.0.1:3000",
|
31 |
"http://127.0.0.1:5173",
|
|
|
32 |
]
|
33 |
|
34 |
app.add_middleware(
|
|
|
11 |
app = FastAPI(
|
12 |
title="Mother Tongue Voice Matcher",
|
13 |
version="0.0.5",
|
14 |
+
servers=[
|
15 |
+
{
|
16 |
+
"url": "http://127.0.0.1:8000/api/v1",
|
17 |
+
"description": "Local Server",
|
18 |
+
"url": "https://r3vibe-mother-tongue.hf.space/api/v1",
|
19 |
+
"description": "Huggingface Server",
|
20 |
+
}
|
21 |
+
],
|
22 |
root_path="/api/v1",
|
23 |
root_path_in_servers=False,
|
24 |
)
|
|
|
34 |
"http://127.0.0.1:8080",
|
35 |
"http://127.0.0.1:3000",
|
36 |
"http://127.0.0.1:5173",
|
37 |
+
"https://r3vibe-mother-tongue.hf.space",
|
38 |
]
|
39 |
|
40 |
app.add_middleware(
|
app/mfcc.py
CHANGED
@@ -27,8 +27,7 @@ def calculate_mfcc(audio_data, sample_rate):
|
|
27 |
|
28 |
|
29 |
def calculate_similarity(mfccs1, mfccs2):
|
30 |
-
similarity = cosine_similarity(
|
31 |
-
mfccs1.reshape(1, -1), mfccs2.reshape(1, -1))
|
32 |
return similarity[0][0]
|
33 |
|
34 |
|
|
|
27 |
|
28 |
|
29 |
def calculate_similarity(mfccs1, mfccs2):
|
30 |
+
similarity = cosine_similarity(mfccs1.reshape(1, -1), mfccs2.reshape(1, -1))
|
|
|
31 |
return similarity[0][0]
|
32 |
|
33 |
|
app/routers/V1/voice/voice_router.py
CHANGED
@@ -14,29 +14,47 @@ router = APIRouter(prefix="/voice", tags=["Voice"])
|
|
14 |
|
15 |
@router.post("/transcribe")
|
16 |
async def transcribe_audio(
|
17 |
-
|
|
|
|
|
18 |
):
|
19 |
try:
|
20 |
# Validate file type
|
21 |
-
if not
|
|
|
|
|
|
|
|
|
|
|
|
|
22 |
raise HTTPException(
|
23 |
status_code=status.HTTP_400_BAD_REQUEST,
|
24 |
detail="Invalid file type. Please upload a wav file.",
|
25 |
)
|
26 |
|
27 |
# Read file bytes
|
28 |
-
|
29 |
-
|
30 |
|
31 |
# Save the file temporarily
|
32 |
-
with open(
|
33 |
-
buffer.write(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
try:
|
36 |
-
text = get_transcription(
|
37 |
percent = match(matcher_text, text)
|
38 |
if int(percent) > 50:
|
39 |
-
Euclidean, Cosine = mfcc_similarty_check(
|
40 |
return JSONResponse(
|
41 |
{
|
42 |
"transcription": text,
|
|
|
14 |
|
15 |
@router.post("/transcribe")
|
16 |
async def transcribe_audio(
|
17 |
+
original: Annotated[UploadFile, File()],
|
18 |
+
recorded: Annotated[UploadFile, File()],
|
19 |
+
matcher_text: Annotated[str, Body()],
|
20 |
):
|
21 |
try:
|
22 |
# Validate file type
|
23 |
+
if not original.filename.endswith(".wav"):
|
24 |
+
raise HTTPException(
|
25 |
+
status_code=status.HTTP_400_BAD_REQUEST,
|
26 |
+
detail="Invalid file type. Please upload a wav file.",
|
27 |
+
)
|
28 |
+
|
29 |
+
if not recorded.filename.endswith(".wav"):
|
30 |
raise HTTPException(
|
31 |
status_code=status.HTTP_400_BAD_REQUEST,
|
32 |
detail="Invalid file type. Please upload a wav file.",
|
33 |
)
|
34 |
|
35 |
# Read file bytes
|
36 |
+
original_bytes = await original.read()
|
37 |
+
filename_original = f"audio_{int(time.time())}_original.wav"
|
38 |
|
39 |
# Save the file temporarily
|
40 |
+
with open(filename_original, "wb") as buffer:
|
41 |
+
buffer.write(original_bytes)
|
42 |
+
|
43 |
+
|
44 |
+
# Read file bytes
|
45 |
+
recorded_bytes = await recorded.read()
|
46 |
+
filename_recorded = f"audio_{int(time.time())}_recorded.wav"
|
47 |
+
|
48 |
+
# Save the file temporarily
|
49 |
+
with open(filename_recorded, "wb") as buffer:
|
50 |
+
buffer.write(recorded_bytes)
|
51 |
+
|
52 |
|
53 |
try:
|
54 |
+
text = get_transcription(filename_recorded)
|
55 |
percent = match(matcher_text, text)
|
56 |
if int(percent) > 50:
|
57 |
+
Euclidean, Cosine = mfcc_similarty_check(filename_original, filename_recorded)
|
58 |
return JSONResponse(
|
59 |
{
|
60 |
"transcription": text,
|
app/transcriber.py
CHANGED
@@ -9,8 +9,7 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
|
9 |
model_id = "openai/whisper-large-v3"
|
10 |
|
11 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
12 |
-
model_id,
|
13 |
-
torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
14 |
)
|
15 |
|
16 |
model.to(device)
|
@@ -30,8 +29,7 @@ pipe = pipeline(
|
|
30 |
device=device,
|
31 |
)
|
32 |
|
33 |
-
dataset = load_dataset(
|
34 |
-
"distil-whisper/librispeech_long", "clean", split="validation")
|
35 |
|
36 |
sample = dataset[0]["audio"]
|
37 |
|
|
|
9 |
model_id = "openai/whisper-large-v3"
|
10 |
|
11 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
12 |
+
model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
|
|
|
13 |
)
|
14 |
|
15 |
model.to(device)
|
|
|
29 |
device=device,
|
30 |
)
|
31 |
|
32 |
+
dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
|
|
|
33 |
|
34 |
sample = dataset[0]["audio"]
|
35 |
|