|
--- |
|
library_name: transformers |
|
pipeline_tag: text-generation |
|
inference: true |
|
widget: |
|
- text: Hello! |
|
example_title: Hello world |
|
group: Python |
|
--- |
|
|
|
This model is for debugging. It is randomly initialized using the config from [Qwen/Qwen2-Audio-7B-Instruct](https://huggingface.co/Qwen/Qwen2-Audio-7B-Instruct) but with smaller size. |
|
|
|
Code:
|
```python |
|
import os |
|
from typing import Dict |
|
|
|
import requests |
|
import torch |
|
import transformers |
|
from PIL import Image |
|
from torchvision import io |
|
from transformers import (AutoConfig, AutoModelForCausalLM, AutoProcessor, |
|
AutoTokenizer, GenerationConfig, |
|
Qwen2AudioForConditionalGeneration, pipeline, |
|
set_seed) |
|
|
|
model_id = "Qwen/Qwen2-Audio-7B-Instruct"
repo_id = "yujiepan/qwen2-audio-tiny-random"
save_path = f"/tmp/{repo_id}"

# Shrink both the audio encoder and the text decoder so the randomly
# initialized checkpoint stays tiny (debugging-only model).
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)
config.audio_config.encoder_layers = 2
config.audio_config.encoder_attention_heads = 2
config.audio_config.encoder_ffn_dim = 32
config.audio_config.d_model = 16
config.text_config.num_hidden_layers = 2
config.text_config.intermediate_size = 32
config.text_config.hidden_size = 16
config.text_config.num_attention_heads = 2
config.text_config.num_key_value_heads = 1

model = Qwen2AudioForConditionalGeneration(config=config)
# Fall back to CPU when no GPU is present — the original unconditional
# `.cuda()` call crashed on CPU-only machines.
device = "cuda" if torch.cuda.is_available() else "cpu"
model = model.to(torch.bfloat16).to(device).eval()
model.generation_config = GenerationConfig.from_pretrained(
    model_id, trust_remote_code=True,
)
set_seed(42)
# Deterministically re-initialize every weight. Sorting by parameter name
# fixes the iteration order, so the RNG stream (and thus the resulting
# checkpoint) is reproducible across runs.
with torch.no_grad():
    for _, p in sorted(model.named_parameters()):
        torch.nn.init.uniform_(p, -0.3, 0.3)

processor = AutoProcessor.from_pretrained(model_id)
model.save_pretrained(save_path)
processor.save_pretrained(save_path)
os.system(f"ls -alh {save_path}")
|
|
|
|
|
def try_inference():
    """Reload the saved tiny checkpoint and run a short generation.

    Downloads two example audio clips, builds a chat-formatted prompt with
    the processor, generates up to 256 tokens, and prints the decoded
    response (gibberish, since the weights are random).
    """
    from io import BytesIO
    from urllib.request import urlopen

    import librosa
    processor = AutoProcessor.from_pretrained(save_path)
    model = Qwen2AudioForConditionalGeneration.from_pretrained(
        save_path, device_map="auto")
    conversation = [
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/guess_age_gender.wav"},
        ]},
        {"role": "assistant", "content": "Yes, the speaker is female and in her twenties."},
        {"role": "user", "content": [
            {"type": "audio", "audio_url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen2-Audio/audio/translate_to_chinese.wav"},
        ]},
    ]
    text = processor.apply_chat_template(
        conversation, add_generation_prompt=True, tokenize=False)
    # Fetch every referenced clip, resampled to the feature extractor's rate.
    audios = []
    for message in conversation:
        if isinstance(message["content"], list):
            for ele in message["content"]:
                if ele["type"] == "audio":
                    audios.append(librosa.load(
                        BytesIO(urlopen(ele['audio_url']).read()),
                        sr=processor.feature_extractor.sampling_rate)[0]
                    )

    inputs = processor(text=text, audios=audios,
                       return_tensors="pt", padding=True)
    # Move *all* tensors (input_ids, attention_mask, input_features, ...)
    # onto the model's device. The original code moved only `input_ids`,
    # which caused a CPU/GPU device mismatch inside `generate` whenever the
    # model was placed on a GPU by `device_map="auto"`.
    inputs = inputs.to(model.device)

    generate_ids = model.generate(**inputs, max_length=256)
    # Strip the prompt tokens; keep only the newly generated continuation.
    generate_ids = generate_ids[:, inputs.input_ids.size(1):]

    response = processor.batch_decode(
        generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
    print(response)
|
|
|
|
|
# Smoke-test the freshly saved checkpoint end-to-end.
try_inference()
|
``` |
|
|