Spaces:

aiola
/

whisper-ner-v1

Running on Zero

App Files Files Community

aiola committed on Oct 2, 2024

Commit

8baa9e5

verified ·

1 Parent(s): bbca453

Update app.py

Browse files

Files changed (1) hide show

app.py +119 -11

app.py CHANGED Viewed

@@ -3,6 +3,7 @@ from transformers import WhisperProcessor, WhisperForConditionalGeneration
 import torch
 import torchaudio
 import spaces
 # Initialize devices
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
@@ -12,13 +13,78 @@ processor = WhisperProcessor.from_pretrained("aiola/whisper-ner-v1")
 model = WhisperForConditionalGeneration.from_pretrained("aiola/whisper-ner-v1")
 model = model.to(device)
 def unify_ner_text(text, symbols_to_replace=("/", " ", ":", "_")):
     """Process and standardize entity text by replacing certain symbols and normalizing spaces."""
-    text = " ".join(text.split())
     for symbol in symbols_to_replace:
-        text = text.replace(symbol, "-")
     return text.lower()
 @spaces.GPU  # This decorator ensures your function can use GPU on Hugging Face Spaces
 def transcribe_and_recognize_entities(audio_file, prompt):
     target_sample_rate = 16000
@@ -48,14 +114,56 @@ def transcribe_and_recognize_entities(audio_file, prompt):
     )
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-    return transcription
-iface = gr.Interface(
-    fn=transcribe_and_recognize_entities,
-    inputs=[gr.Audio(label="Upload Audio", type="filepath"), gr.Textbox(label="Entity Recognition Prompt")],
-    outputs=gr.Textbox(label="Transcription and Entities"),
-    title="Whisper-NER Demo",
-    description="Upload an audio file and enter entities to identify. The model will transcribe the audio and recognize entities."
-)
-iface.launch(share=True)

 import torch
 import torchaudio
 import spaces
+import re
 # Initialize devices
 # Use the CUDA device when torch reports one is available; otherwise fall back to CPU.
 device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
 # Load the pretrained "aiola/whisper-ner-v1" checkpoint and move it to the selected device.
 model = WhisperForConditionalGeneration.from_pretrained("aiola/whisper-ner-v1")
 model = model.to(device)
# Bundled demo inputs. Each entry is ``[audio file path, comma-separated entity
# labels]``, matching the order of the demo's two inputs (audio, labels).
examples = [
    [audio_path, entity_labels]
    for audio_path, entity_labels in (
        (
            "audio/672-122797-0026.wav",
            "monetary-value, biological-classification, desire, demographic-group, object-category, relationship-role, reflexive-pronoun, furniture-type",
        ),
        (
            "audio/672-122797-0024.wav",
            "health-warning, importance-indicator, event, sentiment",
        ),
        (
            "audio/672-122797-0027.wav",
            "action, emotional-resilience, comparative-path-characteristic, social-role",
        ),
        (
            "audio/672-122797-0048.wav",
            "weapon, emotional-state, household-chore, atmosphere-quality",
        ),
        (
            "audio/7021-85628-0025.wav",
            "action-goal, person's-title, emotional-connection, personal-qualities, pronoun-target, assignmentaction, physical-action, family-role",
        ),
    )
]
 def unify_ner_text(text, symbols_to_replace=("/", " ", ":", "_")):
     """Process and standardize entity text by replacing certain symbols and normalizing spaces."""
+    text = " ".join(text.split())
     for symbol in symbols_to_replace:
+        text = text.replace(symbol, "-")
     return text.lower()
# Matches one tagged span of the form ``<label>text<label>>``: group 1 is the
# label, group 2 is the tagged text, and the closing marker must repeat the
# same label followed by ``>>``.
_ENTITY_TAG_PATTERN = re.compile(r"<(.*?)>(.*?)<\1>>")


def extract_entities_and_clean_text_fixed(text):
    """Strip entity tags from ``text`` and report where each entity lands.

    Every ``<label>span<label>>`` occurrence is replaced by ``span`` alone.
    Text that does not match this shape (including unclosed tags) is copied
    through unchanged.

    Args:
        text: Transcription string that may contain tagged entity spans.

    Returns:
        A ``(clean_text, entities)`` tuple. ``clean_text`` is ``text`` with the
        tag markers removed. ``entities`` is a list of dicts, in order of
        appearance, with keys ``"entity"`` (the label), ``"text"`` (the tagged
        span), and ``"start"`` / ``"end"`` (character offsets of the span in
        ``clean_text``, end exclusive).
    """
    entities = []
    clean_parts = []
    # Running length of everything appended to ``clean_parts`` so far; tracking
    # it avoids re-joining the accumulated parts on every match.
    clean_length = 0
    cursor = 0

    for match in _ENTITY_TAG_PATTERN.finditer(text):
        # Untagged text between the previous match and this one.
        prefix = text[cursor:match.start()]
        entity_type, entity_text = match.groups()

        start_pos = clean_length + len(prefix)
        end_pos = start_pos + len(entity_text)

        clean_parts.append(prefix)
        clean_parts.append(entity_text)
        clean_length = end_pos

        entities.append({
            "entity": entity_type,
            "text": entity_text,
            "start": start_pos,
            "end": end_pos,
        })
        cursor = match.end()

    # Whatever follows the last tagged span.
    clean_parts.append(text[cursor:])
    return "".join(clean_parts), entities
 @spaces.GPU  # This decorator ensures your function can use GPU on Hugging Face Spaces
 def transcribe_and_recognize_entities(audio_file, prompt):
     target_sample_rate = 16000
     )
     transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
+    clean_text_fixed, extracted_entities_fixed = extract_entities_and_clean_text_fixed(transcription)
+    return transcription, {"text": clean_text_fixed, "entities": extracted_entities_fixed}
# Build the demo UI: an audio input and an entity-label textbox, with the raw
# transcription and a highlighted-entity view as outputs.
with gr.Blocks(title="WhisperNER v1") as demo:
    gr.Markdown(
        """
        # Whisper-NER: ASR with zero-shot NER
        WhisperNER is a unified model for automatic speech recognition (ASR) and named entity recognition (NER), with zero-shot capabilities.
        ## Links
        * Paper: [WhisperNER: Unified Open Named Entity and Speech Recognition](https://arxiv.org/abs/2409.08107).
        * Model: https://huggingface.co/aiola/whisper-ner-v1
        * Code: https://github.com/aiola-lab/whisper-ner
        """
    )
    with gr.Row():
        with gr.Column():
            audio_input = gr.Audio(label="Audio Example", type="filepath")
        with gr.Column():
            label_input = gr.Textbox(label="Entity Labels")
    gr.Markdown("## Output")
    with gr.Row():
        transcript_output = gr.Textbox(label="Transcription and Entities")
    with gr.Row():
        highlighted_text_output = gr.HighlightedText(label="Predicted Highlighted Entities")
    submit_btn = gr.Button("Submit")

    # The component is not bound to a name so the module-level ``examples``
    # list is not overwritten.
    gr.Examples(
        examples,
        fn=transcribe_and_recognize_entities,
        inputs=[audio_input, label_input],
        outputs=[transcript_output, highlighted_text_output],
        cache_examples=True,
        run_on_click=True,
    )

    # Submitting: pressing Enter in the labels textbox and clicking the Submit
    # button both run the same transcription + entity extraction.
    label_input.submit(
        fn=transcribe_and_recognize_entities,
        inputs=[audio_input, label_input],
        outputs=[transcript_output, highlighted_text_output],
    )
    submit_btn.click(
        fn=transcribe_and_recognize_entities,
        inputs=[audio_input, label_input],
        outputs=[transcript_output, highlighted_text_output],
    )

# Launch after the Blocks context has closed, i.e. once the layout is fully defined.
demo.launch()