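"""Flying Shakespeare: a Gradio app that reads handwritten text from an image
(TrOCR via the Hugging Face Inference API), turns it into a short poem in
Shakespeare's tone (OpenAI via LangChain), and recites the poem with SpeechT5
text-to-speech."""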
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain.llms import OpenAI
import requests
import os
from datasets import load_dataset
import torch
import soundfile as sf
import gradio as gr
from PIL import Image
import numpy as np

from dotenv import load_dotenv, find_dotenv
load_dotenv(find_dotenv())
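# langchain's OpenAI wrapper reads OPENAI_API_KEY from the environment; the
# Hugging Face token env var name used in handwriting_to_text below is an assumption.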

def handwriting_to_text(image):
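    """OCR a handwritten image file via the Hugging Face Inference API (TrOCR)."""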
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"
    headers = {"Authorization": "Bearer "}
    with open(image, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    # The image-to-text endpoint returns a list like [{"generated_text": "..."}].
    return response.json()[0]["generated_text"]

def generate_story(scenario):
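    """Generate a short poem in Shakespeare's tone from the OCR'd text using an LLMChain."""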
    template = """
    Consider yourself as the famous poet "William Shakespeare";
    You can generate a poem in Shakespeare's tone based on a single word; the poem should be no more than 4 lines in length;

    CONTEXT: {scenario}
    POEM:
    """

    prompt = PromptTemplate(template=template, input_variables=["scenario"])

    story_llm = LLMChain(llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1), prompt=prompt, verbose=True)
    story = story_llm.predict(scenario=scenario)

    print(story)
    return story

def recite_the_poem(content):
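    """Synthesize speech for the poem with SpeechT5 and return the WAV bytes."""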
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")

    inputs = processor(text=content, return_tensors="pt")

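    # Condition the voice on a fixed x-vector speaker embedding from the CMU ARCTIC dataset.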
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)

    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)

    sf.write("speech.wav", speech.numpy(), samplerate=16000)

    with open("speech.wav", "rb") as audio_file:
        audio_data = audio_file.read()
    
    return audio_data

def main_model(image):
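    """End-to-end pipeline: handwriting image -> text -> poem -> recited audio."""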
    image = Image.fromarray(np.uint8(image))
    image_path = "temp_image.png"
    image.save(image_path)
    text = handwriting_to_text(image_path)
    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data

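# Gradio UI: an image input, with the generated poem and the recited audio as outputs.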
iface = gr.Interface(
    fn=main_model,
    inputs="image",
    outputs=["text", "audio"],
    title="Flying Shakespeare",
    description="Upload the image generated from the Model:O101-M101/2",
)

if __name__ == "__main__":
    iface.launch()