Update README.md
README.md
## Quickstart

In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for inference, supporting both voice chat and audio analysis modes. Note that we use the ChatML format for dialogue; in this demo we show how to leverage `apply_chat_template` for this purpose.
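For reference, the string returned by `apply_chat_template` is a single ChatML-formatted prompt, roughly laid out as below; the exact audio placeholder tokens are defined by the model's chat template, so treat this as an illustration rather than the literal output:

```
<|im_start|>user
Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>
What does the person say?<|im_end|>
<|im_start|>assistant
```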
### Voice Chat Inference
In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input:
```python
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
    ]},
    {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
    ]},
]
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios = []
for message in conversation:
    if isinstance(message["content"], list):
        for ele in message["content"]:
            if ele["type"] == "audio":
                audios.append(librosa.load(
                    BytesIO(urlopen(ele['audio_url']).read()),
                    sr=processor.feature_extractor.sampling_rate)[0]
                )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to("cuda")
generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
```

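The examples here fetch audio over HTTP, but a local file works the same way as long as it is loaded at the rate the feature extractor expects. A minimal sketch, reusing `processor` and `text` from the snippet above (the path `my_recording.wav` is a hypothetical placeholder):

```python
import librosa

# Load a local file at the feature extractor's sampling rate (16 kHz for Qwen2-Audio)
# and pass it to the processor exactly like the URL-loaded audio above.
audio, _ = librosa.load("my_recording.wav", sr=processor.feature_extractor.sampling_rate)
inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
```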
### Audio Analysis Inference
In the audio analysis mode, users can provide both audio and text instructions for analysis:
```python
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

conversation = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
        {"type": "text", "text": "What's that sound?"},
    ]},
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    {"role": "user", "content": [
        {"type": "text", "text": "What can you do when you hear that?"},
    ]},
    {"role": "assistant", "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        {"type": "text", "text": "What does the person say?"},
    ]},
]
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios = []
for message in conversation:
    if isinstance(message["content"], list):
        for ele in message["content"]:
            if ele["type"] == "audio":
                audios.append(
                    librosa.load(
                        BytesIO(urlopen(ele['audio_url']).read()),
                        sr=processor.feature_extractor.sampling_rate)[0]
                )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to("cuda")

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
```
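The `generate` call above relies on the model's default generation settings; any standard Hugging Face generation argument can be passed explicitly to control decoding. A minimal sketch with illustrative values (not the model's defaults):

```python
generate_ids = model.generate(
    **inputs,
    max_length=256,
    do_sample=True,   # sample from the distribution instead of the default decoding strategy
    top_p=0.9,        # nucleus sampling threshold (illustrative value)
    temperature=0.7,  # softens the token distribution (illustrative value)
)
```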
### Batch Inference
We also support batch inference:
```python
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

conversation1 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
        {"type": "text", "text": "What's that sound?"},
    ]},
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
        {"type": "text", "text": "What can you hear?"},
    ]}
]

conversation2 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        {"type": "text", "text": "What does the person say?"},
    ]},
]

conversations = [conversation1, conversation2]

text = [processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) for conversation in conversations]

audios = []
for conversation in conversations:
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(
                        librosa.load(
                            BytesIO(urlopen(ele['audio_url']).read()),
                            sr=processor.feature_extractor.sampling_rate)[0]
                    )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs['input_ids'] = inputs['input_ids'].to("cuda")
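# From here, generation and decoding work exactly as in the single-sample examples above:
# model.generate(**inputs, max_length=256) followed by processor.batch_decode(...)
# yields one response string per conversation in the batch.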