""" | |
this model only supports english since text to speech is an english only model | |
""" | |
from google.cloud import texttospeech
import os
import openai
import gradio as gr
from dotenv import load_dotenv
import pinecone
""" | |
login to gcp | |
""" | |
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "gcp_access_key.json" | |
# Instantiates a client | |
client = texttospeech.TextToSpeechClient() | |
""" | |
Connecting to Open AI API | |
""" | |
load_dotenv() | |
openai.organization = os.getenv("OPENAI_ORG") | |
openai.api_key = os.getenv("OPENAI_API_KEY") | |
EMBEDDING_MODEL = "text-embedding-ada-002" | |
""" | |
Connecting to pincone API and assign index | |
""" | |
index_name = 'economic-forecast' | |
pinecone.init( | |
api_key=os.getenv("Pinecone_KEY"), | |
environment=os.getenv("Pinecone_ENV") | |
) | |
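# A hedged sketch, not in the original: with the pre-3.0 pinecone-client used
# here, you can verify the index exists before querying it. text-embedding-ada-002
# produces 1536-dimensional vectors, so a matching index would look like this.
if index_name not in pinecone.list_indexes():
    pinecone.create_index(index_name, dimension=1536, metric="cosine")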
"""
Define the text -> speech function.
"""
def text2speech(text):
    # Set the text input to be synthesized
    synthesis_input = texttospeech.SynthesisInput(text=text)

    # Build the voice request: select the language code ("en-US"), a specific
    # voice name, and the SSML voice gender ("female")
    voice = texttospeech.VoiceSelectionParams(
        language_code="en-US",
        name="en-US-News-K",
        ssml_gender=texttospeech.SsmlVoiceGender.FEMALE,
    )

    # Select the type of audio file you want returned
    audio_config = texttospeech.AudioConfig(
        audio_encoding=texttospeech.AudioEncoding.MP3
    )

    # Perform the text-to-speech request on the text input with the selected
    # voice parameters and audio file type
    response = client.synthesize_speech(
        input=synthesis_input, voice=voice, audio_config=audio_config
    )

    # The response's audio_content is binary; write it to an MP3 file
    with open("output.mp3", "wb") as out:
        out.write(response.audio_content)
        print('Audio content written to file "output.mp3"')
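# Minimal usage sketch (assumes the GCP credentials loaded above are valid):
# text2speech("Hello, stranger.")  # writes output.mp3 to the working directory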
""" | |
define voice -> gpt -> text -> voice workflow | |
""" | |
def transcribe(audio):
    """
    Gradio's temporary audio file has no .wav extension, so rename it to the
    correct format before passing it on.
    """
    extension = ".wav"
    audio_formatted = f"{audio}{extension}"
    os.rename(audio, audio_formatted)

    """
    Pass the audio file to Whisper for transcription.
    """
    with open(audio_formatted, "rb") as audio_file:
        transcript = openai.Audio.transcribe("whisper-1", audio_file)

    """
    Run cosine similarity to find context.
    """
    ### Embed the question and search for the relevant text
    index = pinecone.Index(index_name)
    # Embed the user query into an embedding vector
    query = openai.Embedding.create(input=transcript["text"], model=EMBEDDING_MODEL)["data"][0]["embedding"]
    # Run a cosine-similarity search for the most relevant embedded content; this happens inside Pinecone
    res = index.query(vector=query, top_k=3, include_metadata=True)
    contexts = [
        x['metadata']['text'] for x in res['matches']
    ]
    merged_context = "".join(contexts)
    context_with_question = (
        "Context: " + "\n" + merged_context + "*End of the context*" + "\n\n"
        + "Question: " + transcript["text"]
    )

    """
    Pass the transcribed text to GPT.
    """
    ## Initialize a first message to define GPT's role
    messages = [
        {
            "role": "system",
            "content": "You are Elvire, a forest oracle dedicated to sharing her knowledge with accidental strangers.",
        }
    ]
    messages.append({"role": "user", "content": context_with_question})  ## add the user input to the message list
    response = openai.ChatCompletion.create(
        model="gpt-3.5-turbo",
        messages=messages
    )  ## pass the message list to GPT
    messages.append({"role": "assistant", "content": response["choices"][0]["message"]["content"]})  ## add GPT's response to the message list

    text2speech(response["choices"][0]["message"]["content"])  ## create the MP3 voice output
    voice_path = os.path.abspath("output.mp3")
    return voice_path, "\n".join([f"{msg['role']}: {msg['content']}" for msg in messages])
audio_input = gr.inputs.Audio(source="microphone", type="filepath", label="Speak here...")
chat_output = gr.outputs.Textbox(label="Chat Messages")
# transcribe() returns a file path, so the audio output type must be "filepath"
audio_output = gr.outputs.Audio(type="filepath", label="Synthesized Voice")

gr.Interface(
    fn=transcribe,
    inputs=audio_input,
    outputs=[audio_output, chat_output],
    live=True,
    allow_flagging="never",
).launch()
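# A sketch of the dependency pins this script assumes (my guesses, not from the
# original): openai.Audio/openai.ChatCompletion require the pre-1.0 openai SDK,
# pinecone.init requires a pre-3.0 pinecone-client, and the gr.inputs/gr.outputs
# namespaces require a pre-4.0 gradio, e.g. in requirements.txt:
#   openai<1.0
#   pinecone-client<3.0
#   gradio<4.0
#   google-cloud-texttospeech
#   python-dotenv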