# Mustalhim_AI / app.py
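"""Mustalhim: caption an image with BLIP, expand the caption into a story with
ALLaM-7B-Instruct, and read the story aloud with Kokoro TTS via a Gradio app."""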
import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import gradio as gr
# Initialize the image-to-text pipeline
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
# Text-generation model: ALLaM-7B-Instruct, run on CPU
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
# Use LlamaTokenizer for compatibility
tokenizer = LlamaTokenizer.from_pretrained(model_id)
# Initialize the text-generation pipeline
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)
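# Quick smoke test (illustrative only, not part of the app flow):
#   generator([{"role": "user", "content": "Say hello"}])[0]['generated_text']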
# Function to generate a caption for the input image
def Image_Caption(image):
    caption = captionImage(image)
    return caption[0]['generated_text']
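# BLIP occasionally emits the spurious token 'arafed' in captions; Generate_story
# strips it below in case it leaks into the generated story.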
# Function to generate a story from the caption
def Generate_story(textAbout):
    # Chat-style prompt: the pipeline expects a list of message dicts
    messages = [{"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}]
    story = generator(messages)
    story = story[0]['generated_text']
    # Flatten newlines and drop the 'arafed' caption artifact
    story = story.replace('\n', ' ').replace('arafed', ' ')
    return story
# Function to generate audio with Kokoro TTS
def Generate_audio(text, voice='bm_lewis', speed=1):
    tts = KPipeline(lang_code='b')  # 'b' = British English, matching the 'bm_lewis' voice
    segments = tts(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Concatenate the per-segment audio chunks into one waveform
    chunks = [np.asarray(audio) for _, _, audio in segments]
    full_audio = np.concatenate(chunks)
    return full_audio, 24000
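# Kokoro outputs 24 kHz mono audio. Assuming repeated requests, constructing
# KPipeline once at module level (rather than per call) would avoid reloading
# the TTS model on every invocation.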
# Main function: image -> caption -> story -> audio
def Mustalhim(image):
    caption = Image_Caption(image)
    story = Generate_story(caption)
    audio = Generate_audio(story)
    return audio
# Gradio interface: run the pipeline and write the result to a WAV file
def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file
# Path to the example image
example_image = "Example.PNG"
# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Mustalhim",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)
# Launch the app
app.launch()