r3Vibe committed
Commit 24f6bf4
Parent: 5d431d1
app/main.py CHANGED
@@ -11,9 +11,14 @@ from app.routers import routes
 app = FastAPI(
     title="Mother Tongue Voice Matcher",
     version="0.0.5",
-    servers=[{
-        "url": "http://127.0.0.1:8000/api/v1", "description": "Local Server"
-    }],
+    servers=[
+        {
+            "url": "http://127.0.0.1:8000/api/v1",
+            "description": "Local Server",
+            "url": "https://r3vibe-mother-tongue.hf.space/api/v1",
+            "description": "Huggingface Server",
+        }
+    ],
     root_path="/api/v1",
     root_path_in_servers=False,
 )
@@ -29,6 +34,7 @@ origins = [
     "http://127.0.0.1:8080",
     "http://127.0.0.1:3000",
     "http://127.0.0.1:5173",
+    "https://r3vibe-mother-tongue.hf.space",
 ]
 
 app.add_middleware(
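Note: as committed, both server URLs live in a single dict, so the duplicate "url" and "description" keys mean only the Huggingface entry survives in the generated OpenAPI schema. A minimal sketch of the presumably intended shape, with one dict per server (URLs taken from the diff above):

from fastapi import FastAPI

# Sketch only: one dict per advertised server instead of one dict with duplicate keys.
app = FastAPI(
    title="Mother Tongue Voice Matcher",
    version="0.0.5",
    servers=[
        {"url": "http://127.0.0.1:8000/api/v1", "description": "Local Server"},
        {"url": "https://r3vibe-mother-tongue.hf.space/api/v1", "description": "Huggingface Server"},
    ],
    root_path="/api/v1",
    root_path_in_servers=False,
)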
app/mfcc.py CHANGED
@@ -27,8 +27,7 @@ def calculate_mfcc(audio_data, sample_rate):
 
 
 def calculate_similarity(mfccs1, mfccs2):
-    similarity = cosine_similarity(
-        mfccs1.reshape(1, -1), mfccs2.reshape(1, -1))
+    similarity = cosine_similarity(mfccs1.reshape(1, -1), mfccs2.reshape(1, -1))
     return similarity[0][0]
 
 
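A minimal usage sketch of the one-liner above; the (13, 100) shapes are illustrative stand-ins for two MFCC matrices of equal size, and flattening with reshape(1, -1) requires both inputs to yield vectors of the same length:

import numpy as np

from app.mfcc import calculate_similarity  # function shown in the diff above

# Two dummy MFCC matrices of identical shape (coefficients x frames).
mfccs1 = np.random.rand(13, 100)
mfccs2 = np.random.rand(13, 100)

print(calculate_similarity(mfccs1, mfccs2))  # scalar cosine similarity in [-1, 1]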
 
app/routers/V1/voice/voice_router.py CHANGED
@@ -14,29 +14,47 @@ router = APIRouter(prefix="/voice", tags=["Voice"])
 
 @router.post("/transcribe")
 async def transcribe_audio(
-    file: Annotated[UploadFile, File()], matcher_text: Annotated[str, Body()]
+    original: Annotated[UploadFile, File()],
+    recorded: Annotated[UploadFile, File()],
+    matcher_text: Annotated[str, Body()],
 ):
     try:
         # Validate file type
-        if not file.filename.endswith(".wav"):
+        if not original.filename.endswith(".wav"):
+            raise HTTPException(
+                status_code=status.HTTP_400_BAD_REQUEST,
+                detail="Invalid file type. Please upload a wav file.",
+            )
+
+        if not recorded.filename.endswith(".wav"):
             raise HTTPException(
                 status_code=status.HTTP_400_BAD_REQUEST,
                 detail="Invalid file type. Please upload a wav file.",
             )
 
         # Read file bytes
-        file_bytes = await file.read()
-        filename = f"audio_{int(time.time())}.wav"
+        original_bytes = await original.read()
+        filename_original = f"audio_{int(time.time())}_original.wav"
 
         # Save the file temporarily
-        with open(filename, "wb") as buffer:
-            buffer.write(file_bytes)
+        with open(filename_original, "wb") as buffer:
+            buffer.write(original_bytes)
+
+
+        # Read file bytes
+        recorded_bytes = await recorded.read()
+        filename_recorded = f"audio_{int(time.time())}_recorded.wav"
+
+        # Save the file temporarily
+        with open(filename_recorded, "wb") as buffer:
+            buffer.write(recorded_bytes)
+
 
         try:
-            text = get_transcription(filename)
+            text = get_transcription(filename_recorded)
             percent = match(matcher_text, text)
             if int(percent) > 50:
-                Euclidean, Cosine = mfcc_similarty_check(filename, filename)
+                Euclidean, Cosine = mfcc_similarty_check(filename_original, filename_recorded)
                 return JSONResponse(
                     {
                         "transcription": text,
app/transcriber.py CHANGED
@@ -9,8 +9,7 @@ torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
 model_id = "openai/whisper-large-v3"
 
 model = AutoModelForSpeechSeq2Seq.from_pretrained(
-    model_id,
-    torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
+    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
 )
 
 model.to(device)
@@ -30,8 +29,7 @@ pipe = pipeline(
     device=device,
 )
 
-dataset = load_dataset(
-    "distil-whisper/librispeech_long", "clean", split="validation")
+dataset = load_dataset("distil-whisper/librispeech_long", "clean", split="validation")
 
 sample = dataset[0]["audio"]
 
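For context, a short sketch of how the pipeline and sample above are typically used, assuming app/transcriber.py keeps exposing them as module-level objects:

# Minimal sketch; `pipe` and `sample` are the module-level objects built above.
from app.transcriber import pipe, sample

result = pipe(sample)  # run Whisper large-v3 on the librispeech_long validation sample
print(result["text"])  # transcribed text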