Spaces:

MJobe
/

document-vqa-v2

Running

App Files Files Community

MJobe committed about 1 month ago

Commit

cfd8768

•

1 Parent(s): 11d5e31

Update main.py

Browse files

Files changed (1) hide show

main.py +43 -0

main.py CHANGED Viewed

@@ -25,6 +25,7 @@ nlp_qa_v2 = pipeline("document-question-answering", model="faisalraza/layoutlm-i
 nlp_qa_v3 = pipeline("question-answering", model="deepset/roberta-base-squad2")
 nlp_classification = pipeline("text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
 nlp_classification_v2 = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
 description = """
 ## Image-based Document QA
@@ -153,6 +154,48 @@ async def test_classify_text(text: str = Form(...)):
     except Exception as e:
         return JSONResponse(content=f"Error classifying text: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins
 app.add_middleware(

 nlp_qa_v3 = pipeline("question-answering", model="deepset/roberta-base-squad2")
 nlp_classification = pipeline("text-classification", model="distilbert/distilbert-base-uncased-finetuned-sst-2-english")
 nlp_classification_v2 = pipeline("text-classification", model="cardiffnlp/twitter-roberta-base-sentiment-latest")
+# Speech-to-text pipeline (openai/whisper-base); called by /transcribe_and_match/ on the uploaded audio bytes.
+nlp_speech_to_text = pipeline("automatic-speech-recognition", model="openai/whisper-base")
 description = """
 ## Image-based Document QA
     except Exception as e:
         return JSONResponse(content=f"Error classifying text: {str(e)}", status_code=500)
+@app.post("/transcribe_and_match/", description="Transcribe audio and match responses to form fields.")
+async def transcribe_and_match(
+    file: UploadFile = File(...),
+    field_data: str = Form(...)
+):
+    """
+    Transcribe audio and match it to form fields.
+    :param file: The uploaded audio file.
+    :param field_data: A JSON string that contains form field information (field names and IDs).
+    """
+    try:
+        # Step 1: Read and transcribe the audio file
+        contents = await file.read()
+        transcription_result = nlp_speech_to_text(contents)
+        transcription_text = transcription_result['text']
+        # Step 2: Parse the field_data (which contains field names/IDs)
+        # Example: [{"field_id": "name_field", "field_label": "Name"}, {"field_id": "email_field", "field_label": "Email"}]
+        import json
+        fields = json.loads(field_data)
+        # Step 3: Find the matching field for the transcription
+        field_matches = {}
+        for field in fields:
+            field_label = field.get("field_label", "").lower()
+            field_id = field.get("field_id", "")
+            # Simple matching: if the transcribed text contains the field label (or something close)
+            if field_label in transcription_text.lower():
+                field_matches[field_id] = transcription_text
+        # Step 4: Return transcription + matched fields
+        return {
+            "transcription": transcription_text,
+            "matched_fields": field_matches
+        }
+    except Exception as e:
+        return JSONResponse(content=f"Error processing audio or matching fields: {str(e)}", status_code=500)
 # Set up CORS middleware
 origins = ["*"]  # or specify your list of allowed origins
 app.add_middleware(