# assignment3 / function.py
from transformers import pipeline
import torch
from datasets import load_dataset
import soundfile as sf  # used to save audio files
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
#tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
#model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
# function part
# img2text
def img2text(url):
    # Caption the image with BLIP, then strip generic art-style words from the caption
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    text = image_to_text_model(url)[0]["generated_text"]
    for word in ["illustration", "drawing", "painting", "rendering"]:
        text = text.replace(word, "").strip()
    return text
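# Example usage (illustrative only; "example.jpg" is a placeholder path -- the
# image-to-text pipeline also accepts an image URL or a PIL.Image):
# caption = img2text("example.jpg")
# print(caption)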
def text2story(text):
    # Build a story prompt from the caption and sample a continuation from GPT-2
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    prompt = (
        "Please write a short, imaginative story based on the following idea (no more than 10 words):\n\n"
        f"{text}\n\nStory:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        max_length=200,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id
    )
    story_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story_text
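# Note: tokenizer.decode() above returns the prompt together with the generated
# continuation. A minimal sketch of how the prompt could be stripped before display
# (assumes the "Story:" marker used in the prompt is still present in the output):
# def strip_prompt(story_text):
#     marker = "Story:"
#     return story_text.split(marker, 1)[-1].strip() if marker in story_text else story_text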
# Alternative text-to-story function using the pranavpsv/genre-story-generator-v2 model
#def text2story(text):
#    # Load the genre story generator as a text-generation pipeline
#    generator = pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
#    story_text = generator(text)[0]['generated_text']
#    return story_text
# text2story
#def text2story(text):
#    story_text = "abcde"  # to be completed
#    return story_text
# text2audio
#def text2audio(story_text):
#    # Load the TTS model
#    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
#    # Load the speaker embeddings dataset
#    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#    # Pick one speaker's xvector as the embedding (change the index to pick another speaker)
#    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
#    # Synthesize speech from the generated text and the speaker embedding
#    speech = synthesiser(story_text, forward_params={"speaker_embeddings": speaker_embedding})
#    # Save the audio as a wav file
#    sf.write("story_audio.wav", speech["audio"], samplerate=speech["sampling_rate"])
#    # Return the audio file path (the raw audio data could be returned instead if needed)
#    return "story_audio.wav"
def text2audio(story_text):
    # Load the MMS TTS model and its tokenizer
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    # Tokenize the input text
    inputs = tokenizer(story_text, return_tensors="pt")
    # Generate the waveform using the model
    with torch.no_grad():
        output = model(**inputs).waveform
    # Convert the tensor output to numpy for saving
    audio_np = output.squeeze().cpu().numpy()
    # Save the waveform as a .wav file at the model's sampling rate (16 kHz for mms-tts-eng)
    output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, model.config.sampling_rate)
    # Return the path to the generated audio file
    return output_path
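# Minimal end-to-end sketch chaining the three functions above. This is illustrative
# only: "sample.jpg" is a hypothetical local image path, and the __main__ guard keeps
# the sketch from running when the module is imported by the app.
if __name__ == "__main__":
    caption = img2text("sample.jpg")   # image -> caption
    story = text2story(caption)        # caption -> short story
    audio_path = text2audio(story)     # story -> wav file on disk
    print("Caption:", caption)
    print("Story:", story)
    print("Audio saved to:", audio_path)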