import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import gradio as gr
# Initialize the image-to-text (captioning) pipeline
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
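# Hedged usage sketch: the image-to-text pipeline accepts a PIL image, a local
# path, or a URL, and returns a list of dicts. "photo.jpg" is a hypothetical
# file for local testing, not part of this Space. Uncomment to try:
# print(captionImage("photo.jpg"))  # e.g. [{'generated_text': 'a dog on a beach'}]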
# Device and model selection for story generation
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
# Use LlamaTokenizer for compatibility
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Initialize the text-generation pipeline on the selected device
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)
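# Hedged sanity check: recent transformers versions accept a list of chat
# messages and apply the model's chat template automatically. Uncomment to try:
# out = generator([{"role": "user", "content": "Say hello in one sentence."}])
# print(out[0]["generated_text"])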
# Function to generate a caption for an image
def Image_Caption(image):
    caption = captionImage(image)
    return caption[0]['generated_text']
# Function to generate a story from the caption
def Generate_story(textAbout):
    # The pipeline expects a list of chat messages, not a bare dict
    storyAbout = [{"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}]
    story = generator(storyAbout)
    story = story[0]['generated_text']
    # Flatten newlines for TTS and drop 'arafed', a common BLIP caption artifact
    story = story.replace('\n', ' ').replace('arafed', ' ')
    return story
# Function to generate audio with Kokoro (24 kHz output)
def Generate_audio(text, voice='bm_lewis', speed=1):
    pipeline = KPipeline(lang_code='b')  # 'b' = British English, matching the bm_lewis voice
    segments = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Each yielded item is (graphemes, phonemes, audio); concatenate the audio chunks
    full_audio = np.concatenate([np.asarray(audio) for _, _, audio in segments])
    return full_audio, 24000
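# Hedged usage sketch: write the waveform to disk with soundfile. "test.wav" is
# a hypothetical output path for local testing. Uncomment to try:
# waveform, sr = Generate_audio("Hello from Kokoro.")
# sf.write("test.wav", waveform, sr)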
# Main function to process the image and generate audio
def Mustalhim(image):
    caption = Image_Caption(image)
    story = Generate_story(caption)
    audio = Generate_audio(story)
    return audio
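# Hedged end-to-end check (assumes Pillow is installed and Example.PNG exists):
# from PIL import Image
# waveform, sr = Mustalhim(Image.open("Example.PNG"))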
# Gradio interface: run the pipeline and return a WAV file path
def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file
# Path to the example image
example_image = "Example.PNG"
# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Mustalhim",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]]
)
# Launch the app
app.launch()