import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import gradio as gr


# Initialize the image-to-text pipeline
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
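# The pipeline returns a list of dicts; illustrative output (not from a real run):
# [{'generated_text': 'arafed dog sitting on a wooden bench in a park'}]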

# Text-generation model configuration (CPU inference)
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"

# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",      # use the dtype stored in the checkpoint
    trust_remote_code=True,  # allow any custom modeling code shipped with the repo
)

# Use LlamaTokenizer for compatibility
tokenizer = LlamaTokenizer.from_pretrained(model_id)

# Build the text-generation pipeline from the loaded model and tokenizer
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=False,  # return only the newly generated text, not the prompt
    max_new_tokens=500,
    do_sample=False,         # greedy decoding, so output is deterministic
)
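# Minimal smoke test (commented out so the app boots without extra compute);
# it mirrors the chat-format input that Generate_story builds below:
#   generator([{"role": "user", "content": "Say hello."}])[0]['generated_text']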

# Function to generate a caption for the image
def Image_Caption(image):
    caption = captionImage(image)
    return caption[0]['generated_text']

# Function to generate a story
def Generate_story(textAbout):
    # Chat-format input: a list containing a single user message
    storyAbout = [{"role": "user", "content": f'write a long story about {textAbout} that takes 3 min to read'}]
    story = generator(storyAbout)
    story = story[0]['generated_text']
    # Flatten newlines; 'arafed' is a spurious token BLIP captions sometimes leak into the text
    story = story.replace('\n', ' ').replace('arafed', ' ')
    return story

# Function to generate audio
def Generate_audio(text, voice='bm_lewis', speed=1):
    pipeline = KPipeline(lang_code='b')  # 'b' selects the British English voice set
    segments = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Each segment is (graphemes, phonemes, audio); stitch the audio chunks together
    chunks = [np.asarray(audio) for _, _, audio in segments]
    full_audio = np.concatenate(chunks)
    return full_audio, 24000  # Kokoro synthesizes at 24 kHz
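
# Sketch of a standalone check (hypothetical input text), reusing the same
# soundfile import as the Gradio handler below:
#   wave, sr = Generate_audio("A short test sentence.")
#   sf.write("check.wav", wave, sr)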

# Main function to process the image and generate audio
def Mustalhim(image):
    caption = Image_Caption(image)
    story = Generate_story(caption)
    audio = Generate_audio(story)
    return audio

# Gradio interface
def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file

# Path to the example image
example_image = "Example.PNG"
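# Assumption: Example.PNG sits next to this script; Gradio resolves example
# paths relative to the working directory.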

# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Mustalhim",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]]
)

# Launch the app
app.launch()