from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, VitsModel
import torch
from datasets import load_dataset
import soundfile as sf  # used to save audio files
#tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
#model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
# function part

# img2text
def img2text(url):
    # Caption the image with the BLIP base image-captioning model
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    text = image_to_text_model(url)[0]["generated_text"]
    # Strip style words so the caption reads as a plain scene description
    for word in ["illustration", "drawing", "painting", "rendering"]:
        text = text.replace(word, "").strip()
    return text
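
# A minimal usage sketch for img2text; "example.jpg" is a hypothetical local
# image path used for illustration and is not shipped with this app.
#caption = img2text("example.jpg")
#print(caption)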
def text2story(text):
    # GPT-2 as the story generator; loaded on each call for simplicity
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    prompt = (
        f"Please write a short, imaginative story based on the following idea (no more than 10 words):\n\n"
        f"{text}\n\nStory:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    # Sample a continuation of up to 200 tokens, including the prompt
    outputs = model.generate(
        inputs.input_ids,
        max_length=200,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id,
    )
    story_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story_text
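
# A quick sanity-check sketch for text2story; the caption string below is a
# made-up example, not output captured from the captioning model.
#story = text2story("a cat sitting on a windowsill")
#print(story)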
# Alternative text2story using the pranavpsv/genre-story-generator-v2 model (commented out)
#def text2story(text):
#    generator = pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
#    story_text = generator(text)[0]["generated_text"]
#    return story_text

# text2story placeholder
#def text2story(text):
#    story_text = "abcde"  # to be completed
#    return story_text
# text2audio
# Alternative: SpeechT5 text-to-speech (commented out)
#def text2audio(story_text):
#    # Load the TTS model
#    synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
#    # Load the speaker embeddings dataset
#    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#    # Pick one speaker's xvector as the embedding (change the index to choose another speaker)
#    speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
#    # Synthesize speech from the generated text and the speaker embedding
#    speech = synthesiser(story_text, forward_params={"speaker_embeddings": speaker_embedding})
#    # Save the audio as a wav file
#    sf.write("story_audio.wav", speech["audio"], samplerate=speech["sampling_rate"])
#    # Return the audio file path (the raw audio data could be returned instead if needed)
#    return "story_audio.wav"
def text2audio(story_text):
    # Load the MMS-TTS English model and its tokenizer
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")
    # Tokenize the input text
    inputs = tokenizer(story_text, return_tensors="pt")
    # Generate the waveform using the model
    with torch.no_grad():
        output = model(**inputs).waveform
    # Convert the tensor output to numpy for saving
    audio_np = output.squeeze().cpu().numpy()
    # Save the waveform as a .wav file at the model's native sampling rate
    output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, model.config.sampling_rate)
    # Return the path to the saved audio file
    return output_path
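
# A minimal end-to-end sketch wiring the three steps together, guarded so it
# only runs when this file is executed directly; "example.jpg" is a
# hypothetical input image, not a file provided by this app.
if __name__ == "__main__":
    caption = img2text("example.jpg")  # image -> caption
    story = text2story(caption)        # caption -> short story
    audio_path = text2audio(story)     # story -> narrated wav file
    print(f"Caption: {caption}")
    print(f"Story: {story}")
    print(f"Audio saved to: {audio_path}")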