from transformers import pipeline
import torch
from datasets import load_dataset
import soundfile as sf  # used to save audio files
from transformers import AutoTokenizer, AutoModelForCausalLM, VitsModel
from IPython.display import Audio  # needed for the Audio object returned by text2audio

# tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")
# model = AutoModelForCausalLM.from_pretrained("deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B")

# function part

# img2text: caption an image and strip style words from the caption
def img2text(url):
    image_to_text_model = pipeline("image-to-text", model="Salesforce/blip-image-captioning-base")
    text = image_to_text_model(url)[0]["generated_text"]
    # Remove style words so the caption reads as a plain scene description
    for word in ["illustration", "drawing", "painting", "rendering"]:
        text = text.replace(word, "").strip()
    return text

# text2story: expand a short caption into a story with GPT-2
def text2story(text):
    tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
    model = AutoModelForCausalLM.from_pretrained("openai-community/gpt2")
    prompt = (
        f"Please write a short, imaginative story based on the following idea (no more than 10 words):\n\n"
        f"{text}\n\nStory:"
    )
    inputs = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        inputs.input_ids,
        max_length=200,
        do_sample=True,
        top_p=0.95,
        temperature=0.8,
        pad_token_id=tokenizer.eos_token_id
    )
    # Note: the decoded text includes the prompt followed by the generated story
    story_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return story_text

# Alternative text-to-story implementation using a pretrained story-generation model
# def text2story(text):
#     generator = pipeline("text-generation", model="pranavpsv/genre-story-generator-v2")
#     story_text = generator(text)[0]["generated_text"]
#     return story_text

# Placeholder text2story stub
# def text2story(text):
#     story_text = "abcde"  # to be completed
#     return story_text

# Alternative text2audio implementation using SpeechT5
# def text2audio(story_text):
#     # Load the TTS model
#     synthesiser = pipeline("text-to-speech", model="microsoft/speecht5_tts")
#     # Load the speaker embeddings dataset
#     embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
#     # Select one speaker's xvector as the embedding (change the index to pick a different speaker)
#     speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
#     # Synthesise speech from the generated text and the speaker embedding
#     speech = synthesiser(story_text, forward_params={"speaker_embeddings": speaker_embedding})
#     # Save the audio as a .wav file
#     sf.write("story_audio.wav", speech["audio"], samplerate=speech["sampling_rate"])
#     # Return the path to the audio file (the raw audio data could be returned instead)
#     return "story_audio.wav"

# text2audio: synthesise speech from the story text with MMS-TTS
def text2audio(story_text):
    # Load the model and tokenizer
    model = VitsModel.from_pretrained("facebook/mms-tts-eng")
    tokenizer = AutoTokenizer.from_pretrained("facebook/mms-tts-eng")

    # Tokenize the input text
    inputs = tokenizer(story_text, return_tensors="pt")

    # Generate the waveform using the model
    with torch.no_grad():
        output = model(**inputs).waveform

    # Convert the tensor output to numpy for saving
    audio_np = output.squeeze().cpu().numpy()

    # Save the waveform as a .wav file at the model's own sampling rate
    output_path = "generated_audio.wav"
    sf.write(output_path, audio_np, model.config.sampling_rate)

    # Return an IPython Audio object so the file can be played in a notebook
    return Audio(output_path)
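
# Example usage: a minimal sketch wiring the three functions together. The image path
# "example.jpg" is a hypothetical placeholder, and the returned IPython Audio object is
# only rendered automatically inside a notebook environment.
if __name__ == "__main__":
    caption = img2text("example.jpg")   # image -> caption
    story = text2story(caption)         # caption -> short story
    audio = text2audio(story)           # story -> speech saved to generated_audio.wav
    print(story)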