Spaces:

MJobe
/

document-vqa-v2

Sleeping

MJobe committed on Oct 22, 2024

Commit

58b3b85

•

1 Parent(s): 1f23076

Update main.py

Files changed (1) hide show

main.py CHANGED Viewed

@@ -168,7 +168,11 @@ async def transcribe_and_match(
         contents = await file.read()
         audio = AudioSegment.from_file(BytesIO(contents))
-        # Step 2: Export to WAV format and load with torchaudio
         wav_buffer = BytesIO()
         audio.export(wav_buffer, format="wav")
         wav_buffer.seek(0)
@@ -179,14 +183,14 @@ async def transcribe_and_match(
         # Convert waveform to float32
         samples = waveform.numpy().astype(np.float32)
-        # Step 3: Use the speech-to-text model
         transcription_result = nlp_speech_to_text(samples)
         transcription_text = transcription_result['text']
-        # Step 4: Parse the field_data (which contains field names/IDs)
         fields = json.loads(field_data)
-        # Step 5: Find the matching field for the transcription
         field_matches = {}
         for field in fields:
             field_label = field.get("field_label", "").lower()
@@ -196,7 +200,7 @@ async def transcribe_and_match(
             if field_label in transcription_text.lower():
                 field_matches[field_id] = transcription_text
-        # Step 6: Return transcription + matched fields
         return {
             "transcription": transcription_text,
             "matched_fields": field_matches

         contents = await file.read()
         audio = AudioSegment.from_file(BytesIO(contents))
+        # Step 2: Ensure audio is mono
+        if audio.channels > 1:
+            audio = audio.set_channels(1)  # Convert to mono
+        # Step 3: Export to WAV format and load with torchaudio
         wav_buffer = BytesIO()
         audio.export(wav_buffer, format="wav")
         wav_buffer.seek(0)
         # Convert waveform to float32
         samples = waveform.numpy().astype(np.float32)
+        # Step 4: Use the speech-to-text model
         transcription_result = nlp_speech_to_text(samples)
         transcription_text = transcription_result['text']
+        # Step 5: Parse the field_data (which contains field names/IDs)
         fields = json.loads(field_data)
+        # Step 6: Find the matching field for the transcription
         field_matches = {}
         for field in fields:
             field_label = field.get("field_label", "").lower()
             if field_label in transcription_text.lower():
                 field_matches[field_id] = transcription_text
+        # Step 7: Return transcription + matched fields
         return {
             "transcription": transcription_text,
             "matched_fields": field_matches