ahmad4raza's picture
Update app.py
46e71b6
raw
history blame
2.65 kB
import io
import os
from functools import lru_cache

import gradio as gr
import numpy as np
import requests
import soundfile as sf
import torch
from datasets import load_dataset
from dotenv import load_dotenv, find_dotenv
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from PIL import Image
from transformers import pipeline, SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
load_dotenv(find_dotenv())
def handwriting_to_text(image):
    """OCR a handwriting image via the Hugging Face Inference API (TrOCR).

    Args:
        image: Path to an image file on disk.

    Returns:
        The parsed JSON response from the API — typically a list like
        ``[{"generated_text": "..."}]`` on success, or an error dict.
        (No error handling: a network failure raises from ``requests``.)
    """
    API_URL = "https://api-inference.huggingface.co/models/microsoft/trocr-base-handwritten"
    # SECURITY FIX: the bearer token was previously hard-coded in source
    # (a leaked credential). Read it from the environment instead — it is
    # loaded from .env by load_dotenv() at import time.
    token = os.getenv("HUGGINGFACEHUB_API_TOKEN", "")
    headers = {"Authorization": f"Bearer {token}"}
    with open(image, "rb") as f:
        data = f.read()
    response = requests.post(API_URL, headers=headers, data=data)
    return response.json()
def generate_story(scenario):
    """Produce a short (<= 4 line) Shakespeare-style poem from *scenario*.

    Args:
        scenario: The word/context the poem should be based on.

    Returns:
        The poem text generated by the LLM.
    """
    poem_template = """
Consider yourself as the famous poet "William Shakespere";
You can generate a poem in Shakespeare's tone based on a single word, the poem should be no more than 4 lines in length;
CONTEXT: {scenario}
POEM:
"""
    # Build the prompt + LLM chain inline; verbose=True logs the filled
    # prompt to stdout for debugging.
    chain = LLMChain(
        llm=OpenAI(model_name="gpt-3.5-turbo", temperature=1),
        prompt=PromptTemplate(template=poem_template, input_variables=["scenario"]),
        verbose=True,
    )
    poem = chain.predict(scenario=scenario)
    print(poem)
    return poem
@lru_cache(maxsize=1)
def _load_tts_components():
    """Load and cache the SpeechT5 TTS components and one speaker embedding.

    Loading two models, a vocoder, and a full dataset split is expensive
    (network downloads + model init); previously this happened on every
    call to recite_the_poem. lru_cache makes it a one-time cost.
    """
    processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
    model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
    vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
    embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
    # Index 7306 picks one specific CMU-Arctic speaker's x-vector.
    speaker_embeddings = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
    return processor, model, vocoder, speaker_embeddings


def recite_the_poem(content):
    """Synthesize *content* to speech and return the WAV file's raw bytes.

    Args:
        content: Text to speak.

    Returns:
        Bytes of a 16 kHz WAV file (also written to ./speech.wav as a
        side effect, matching the original behavior).
    """
    processor, model, vocoder, speaker_embeddings = _load_tts_components()
    inputs = processor(text=content, return_tensors="pt")
    speech = model.generate_speech(inputs["input_ids"], speaker_embeddings, vocoder=vocoder)
    sf.write("speech.wav", speech.numpy(), samplerate=16000)
    with open("speech.wav", "rb") as audio_file:
        audio_data = audio_file.read()
    return audio_data
def main_model(image):
    """Gradio pipeline: OCR the handwriting image, write a poem, recite it.

    Args:
        image: numpy array delivered by the Gradio "image" input.

    Returns:
        Tuple of (poem_text, wav_bytes) matching the ["text", "audio"]
        outputs of the interface.
    """
    image = Image.fromarray(np.uint8(image))
    image_path = "temp_image.png"
    image.save(image_path)
    result = handwriting_to_text(image_path)
    # BUG FIX: handwriting_to_text returns parsed JSON — typically
    # [{"generated_text": "..."}] — which was previously passed verbatim
    # into the poem prompt. Extract the recognized text; fall back to the
    # stringified response so error payloads remain visible in the output.
    if isinstance(result, list) and result and isinstance(result[0], dict):
        text = result[0].get("generated_text", str(result))
    elif isinstance(result, dict) and "generated_text" in result:
        text = result["generated_text"]
    else:
        text = str(result)
    poem = generate_story(text)
    audio_data = recite_the_poem(poem)
    return poem, audio_data
# Wire the pipeline into a Gradio UI: one image upload in, the poem text
# and its recited audio out.
iface = gr.Interface(
    fn=main_model,
    inputs="image",
    outputs=["text", "audio"],
    title="Flying Shakespeare",
    description="Upload the image generated from the Model:O101-M101/2",
)
# Start the web server only when run as a script (not when imported).
if __name__ == "__main__":
    iface.launch()