Update README.md
README.md
## Quickstart

In the following, we demonstrate how to use `Qwen2-Audio-7B-Instruct` for inference, supporting both voice chat and audio analysis modes. Note that we use the ChatML format for dialogue; in this demo we show how to leverage `apply_chat_template` for this purpose.
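For reference, the string returned by `apply_chat_template` is a single ChatML-formatted prompt, roughly laid out as below; the exact audio placeholder tokens are defined by the model's chat template, so treat this as an illustration rather than the literal output:

```
<|im_start|>user
Audio 1: <|audio_bos|><|AUDIO|><|audio_eos|>
What does the person say?<|im_end|>
<|im_start|>assistant
```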
### Voice Chat Inference
In the voice chat mode, users can freely engage in voice interactions with Qwen2-Audio without text input:
```python
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

conversation = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
    ]},
    {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
    ]},
]
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios = []
for message in conversation:
    if isinstance(message["content"], list):
        for ele in message["content"]:
            if ele["type"] == "audio":
                audios.append(librosa.load(
                    BytesIO(urlopen(ele['audio_url']).read()),
                    sr=processor.feature_extractor.sampling_rate)[0]
                )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to("cuda")
generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
```

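The examples here fetch audio over HTTP, but a local file works the same way as long as it is loaded at the rate the feature extractor expects. A minimal sketch, reusing `processor` and `text` from the snippet above (the path `my_recording.wav` is a hypothetical placeholder):

```python
import librosa

# Load a local file at the feature extractor's sampling rate (16 kHz for Qwen2-Audio)
# and pass it to the processor exactly like the URL-loaded audio above.
audio, _ = librosa.load("my_recording.wav", sr=processor.feature_extractor.sampling_rate)
inputs = processor(text=text, audios=[audio], return_tensors="pt", padding=True)
```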
### Audio Analysis Inference
In the audio analysis mode, users can provide both audio and text instructions for analysis:
```python
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

conversation = [
    {'role': 'system', 'content': 'You are a helpful assistant.'},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
        {"type": "text", "text": "What's that sound?"},
    ]},
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    {"role": "user", "content": [
        {"type": "text", "text": "What can you do when you hear that?"},
    ]},
    {"role": "assistant", "content": "Stay alert and cautious, and check if anyone is hurt or if there is any damage to property."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        {"type": "text", "text": "What does the person say?"},
    ]},
]
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios = []
for message in conversation:
    if isinstance(message["content"], list):
        for ele in message["content"]:
            if ele["type"] == "audio":
                audios.append(
                    librosa.load(
                        BytesIO(urlopen(ele['audio_url']).read()),
                        sr=processor.feature_extractor.sampling_rate)[0]
                )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs.input_ids = inputs.input_ids.to("cuda")

generate_ids = model.generate(**inputs, max_length=256)
generate_ids = generate_ids[:, inputs.input_ids.size(1):]

response = processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
```
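The `generate` call above relies on the model's default generation settings; any standard Hugging Face generation argument can be passed explicitly to control decoding. A minimal sketch with illustrative values (not the model's defaults):

```python
generate_ids = model.generate(
    **inputs,
    max_length=256,
    do_sample=True,   # sample from the distribution instead of the default decoding strategy
    top_p=0.9,        # nucleus sampling threshold (illustrative value)
    temperature=0.7,  # softens the token distribution (illustrative value)
)
```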
### Batch Inference
We also support batch inference:
```python
from io import BytesIO
from urllib.request import urlopen
import librosa
from transformers import Qwen2AudioForConditionalGeneration, AutoProcessor

processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
model = Qwen2AudioForConditionalGeneration.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct", device_map="auto")

conversation1 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/glass-breaking-151256.mp3"},
        {"type": "text", "text": "What's that sound?"},
    ]},
    {"role": "assistant", "content": "It is the sound of glass shattering."},
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/f2641_0_throatclearing.wav"},
        {"type": "text", "text": "What can you hear?"},
    ]}
]

conversation2 = [
    {"role": "user", "content": [
        {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/1272-128104-0000.flac"},
        {"type": "text", "text": "What does the person say?"},
    ]},
]

conversations = [conversation1, conversation2]

text = [processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False) for conversation in conversations]

audios = []
for conversation in conversations:
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(
                        librosa.load(
                            BytesIO(urlopen(ele['audio_url']).read()),
                            sr=processor.feature_extractor.sampling_rate)[0]
                    )

inputs = processor(text=text, audios=audios, return_tensors="pt", padding=True)
inputs['input_ids'] = inputs['input_ids'].to("cuda")
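# From here, generation and decoding work exactly as in the single-sample examples above:
# model.generate(**inputs, max_length=256) followed by processor.batch_decode(...)
# yields one response string per conversation in the batch.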