Spaces:

nslaughter
/

voicenote-2-actionitems

Sleeping

App Files Files Community

Nathan Slaughter committed on Oct 4

Commit

e8ecce6

•

1 Parent(s): 76ed6be

add Qwen2VL for action item inference

Browse files

Files changed (2) hide show

app.py +84 -48
requirements.txt +3 -0

app.py CHANGED Viewed

@@ -1,66 +1,103 @@
 import torch
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 import gradio as gr
 import librosa
-# 1. Determine the device
-device = "cuda" if torch.cuda.is_available() else "cpu"
-# 2. Load the processor and model
-processor = AutoProcessor.from_pretrained("openai/whisper-large", language='en')
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
     "openai/whisper-large"
 )
-# 3. Move the model to the device
-model.to(device)
-def transcribe_audio(audio_path: str) -> str:
     try:
-        # Step 1: Load the audio file
-        # librosa.load returns a tuple (audio_data, sampling_rate)
-        audio, sr = librosa.load(audio_path, sr=16000)  # Resample to 16000 Hz
-        # Step 2: Transcribe the audio
-        inputs = processor(audio, sampling_rate=16000, return_tensors="pt", language='en')
-        input_features = inputs.input_features.to(device) #type: ignore
-        # Generate transcription
-        with torch.no_grad(): #type: ignore
-            predicted_ids = model.generate(input_features) #type: ignore
-        # Decode the transcription
-        transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
-        return transcript
     except Exception as e:
-        return f"Error during processing: {str(e)}"
 def extract_action_items(transcript: str) -> str:
-    action_keywords = ["action item", "todo", "task", "follow up", "need to"]
-    sentences = transcript.split('.')
-    action_items = [
-        sentence.strip() + '.'
-        for sentence in sentences
-        if any(keyword in sentence.lower() for keyword in action_keywords)
-    ]
-    return "\n".join(action_items) if action_items else "No action items found."
-def transcribe_and_extract_action_items(audio_path: str) -> tuple[str, str]:
     try:
-        transcript = transcribe_audio(audio_path)
-        # Join action items into a single string, separated by newlines
-        action_items_text = extract_action_items(transcript)
-        return transcript, action_items_text
     except Exception as e:
-        return f"Error during processing: {str(e)}", ""
 # Define the Gradio interface components
 input_audio = gr.Audio(
@@ -96,4 +133,3 @@ interface = gr.Interface(
 # 5. Launch the interface
 if __name__ == "__main__":
     interface.launch()

 import torch
+from transformers import Qwen2VLForConditionalGeneration, AutoProcessor, AutoModelForSpeechSeq2Seq
 import gradio as gr
 import librosa
+# Pick the compute device: CUDA first, then Apple MPS, otherwise the CPU.
+# The conditional expression short-circuits in that order.
+device = torch.device(
+    "cuda" if torch.cuda.is_available()
+    else "mps" if torch.backends.mps.is_available()
+    else "cpu"
+)
+# Load the speech-to-text processor and model once at import time;
+# transcribe_audio reads both as module-level names.
+stt_processor = AutoProcessor.from_pretrained("openai/whisper-large", language='en')
+stt_model = AutoModelForSpeechSeq2Seq.from_pretrained(
     "openai/whisper-large"
 )
+# Move the speech-to-text model to the selected device
+stt_model.to(device)
+def transcribe_audio(audio_path: str) -> str:
+    """
+    Transcribe the audio file at `audio_path` with the speech-to-text model.
+
+    The audio is loaded at 16 kHz, the same sampling rate that is passed to
+    the processor. Returns the decoded transcript, or an
+    "Error during transcription: ..." message if any step raises.
+    """
     try:
+        audio, _ = librosa.load(audio_path, sr=16000)
+        inputs = stt_processor(audio, sampling_rate=16000, return_tensors="pt", language='en')
+        input_features = inputs.input_features.to(device)
+        with torch.no_grad():
+            predicted_ids = stt_model.generate(input_features)
+        # Return from inside the try block. A `finally: return transcript`
+        # would override the error message returned below and raise
+        # UnboundLocalError whenever `transcript` was never assigned.
+        return stt_processor.batch_decode(predicted_ids, skip_special_tokens=True, language='en')[0]
     except Exception as e:
+        return f"Error during transcription: {str(e)}"
 def extract_action_items(transcript: str) -> str:
+    """
+    Extracts action items from a transcript using the Qwen2-VL-7B-Instruct model.
+    see example code in the model card: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct
+
+    Returns the generated text as a single string, or an
+    "Error during action item extraction: ..." message if any step raises.
+    """
+    model_id = "Qwen/Qwen2-VL-7B-Instruct"
     try:
+        # NOTE(review): the model and processor are loaded on every call;
+        # consider loading them once if request latency matters.
+        model = Qwen2VLForConditionalGeneration.from_pretrained(
+            model_id,
+            torch_dtype=torch.bfloat16,
+            device_map="auto",
+            # attn_implementation="flash_attention_2"
+        )
+        # default processor
+        processor = AutoProcessor.from_pretrained(model_id)
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": f"""Infer the action items from the following meeting transcript
+                     and list them as a bulleted list in the format:\n- [item short title]: [item description]
+                    The [item short title] should be a short phrase that summarizes the action item.
+                    The [item description] should be a longer description of the action item.
+                    TRANSCRIPT:
+                    {transcript}
+                    """
+                    }
+                ],
+            }
+        ]
+        # Preparation for inference
+        text = processor.apply_chat_template(
+            messages, tokenize=False, add_generation_prompt=True
+        )
+        inputs = processor(
+            text=[text],
+            padding=True,
+            return_tensors="pt",
+        )
+        # device_map="auto" decides where the model is placed, so send the
+        # inputs to the model's device rather than the module-level `device`.
+        inputs = inputs.to(model.device)
+        # Extract action items
+        generated_ids = model.generate(**inputs, max_new_tokens=128)
+        # Drop the prompt tokens so only newly generated tokens are decoded.
+        generated_ids_trimmed = [
+            out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+        ]
+        output_text = processor.batch_decode(
+            generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
+        )
+        # batch_decode yields one string per batch entry and the batch holds
+        # a single prompt; return that string to honor the `-> str` contract.
+        return output_text[0]
     except Exception as e:
+        return f"Error during action item extraction: {str(e)}"
+def transcribe_and_extract_action_items(audio_path):
+    """
+    Run the full pipeline for one recording: transcribe it, then extract action items.
+
+    Returns a `(transcript, action_items)` tuple. The transcript is handed to
+    `extract_action_items` exactly as `transcribe_audio` returned it.
+    """
+    spoken_text = transcribe_audio(audio_path)
+    return spoken_text, extract_action_items(spoken_text)
+##################################################
+# Gradio Interface
+##################################################
 # Define the Gradio interface components
 input_audio = gr.Audio(
 # 5. Launch the interface
 if __name__ == "__main__":
     interface.launch()

requirements.txt CHANGED Viewed

@@ -3,3 +3,6 @@ pydantic
 openai
 librosa
 langchain

 openai
 librosa
 langchain
+transformers
+bitsandbytes
+accelerate