import torch
from transformers import AutoModelForCausalLM, LlamaTokenizer, pipeline as transformers_pipeline
from kokoro import KPipeline
import soundfile as sf
import numpy as np
import gradio as gr
# Initialize the image-to-text (captioning) pipeline
captionImage = transformers_pipeline("image-to-text", model="Salesforce/blip-image-captioning-large")
# Text-generation model configuration
device = "cpu"
model_id = "ALLaM-AI/ALLaM-7B-Instruct-preview"
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype="auto",
    trust_remote_code=True,
)
# Use LlamaTokenizer for compatibility
tokenizer = LlamaTokenizer.from_pretrained(model_id)
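# Note: AutoTokenizer.from_pretrained(model_id) is the more general choice and
# would select the correct tokenizer class automatically; LlamaTokenizer is
# kept here on the assumption that this checkpoint is Llama-based.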
# Initialize the text-generation pipeline
generator = transformers_pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=device,
    return_full_text=False,
    max_new_tokens=500,
    do_sample=False,
)
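# Optional sanity check: recent transformers versions accept chat-style
# message lists and apply the model's chat template automatically, e.g.:
#   generator([{"role": "user", "content": "Say hello."}])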
# Function to generate a caption
def Image_Caption(image):
    caption = captionImage(image)
    return caption[0]['generated_text']
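# The image-to-text pipeline returns a list of dicts, e.g.
#   [{'generated_text': 'a dog running on a beach'}]
# (the caption text here is illustrative only), hence [0]['generated_text'] above.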
# Function to generate a story
def Generate_story(textAbout):
    # Chat-format input: a list containing a single user message
    storyAbout = [{"role": "user", "content": f'write a long story about {textAbout} that takes 3 minutes to read'}]
    story = generator(storyAbout)
    story = story[0]['generated_text']
    # Flatten newlines and drop 'arafed', a common BLIP captioning artifact
    story = story.replace('\n', ' ').replace('arafed', ' ')
    return story
# Function to generate audio
def Generate_audio(text, voice='bm_lewis', speed=1):
    pipeline = KPipeline(lang_code='b')  # 'b' = British English
    generator = pipeline(text, voice=voice, speed=speed, split_pattern=r'\n+')
    # Each yielded item is (graphemes, phonemes, audio); concatenate the chunks
    chunks = [audio for _, _, audio in generator]
    full_audio = np.concatenate(chunks)
    return full_audio, 24000  # Kokoro outputs 24 kHz audio
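# Minimal standalone check (hypothetical, outside the app flow):
#   waveform, sr = Generate_audio("Hello from Kokoro.")
#   sf.write("check.wav", waveform, sr)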
# Main function to process the image and generate audio
def Mustalhim(image):
    caption = Image_Caption(image)
    story = Generate_story(caption)
    audio = Generate_audio(story)
    return audio
# Gradio interface: write the waveform to a WAV file and return its path
def gradio_interface(image):
    audio_waveform, sampling_rate = Mustalhim(image)
    audio_file = "output_audio.wav"
    sf.write(audio_file, audio_waveform, sampling_rate)
    return audio_file
# Path to the example image
example_image = "Example.PNG"
# Create the Gradio app
app = gr.Interface(
    fn=gradio_interface,
    inputs=gr.Image(type="pil"),
    outputs=gr.Audio(type="filepath"),
    title="Mustalhim",
    description="Upload an image, and the app will generate a story and convert it to audio.",
    examples=[[example_image]],
)
# Launch the app
app.launch()
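# When running outside Hugging Face Spaces, a temporary public URL can be
# created by launching with app.launch(share=True) instead.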