import openai
import os
import sys
import gradio as gr
import time
import boto3
import json
import numpy as np
import wave
import io
from langchain.llms import OpenAI
from langchain.chains import ConversationChain
from langchain.memory import ConversationSummaryBufferMemory
from langchain.memory import ConversationBufferWindowMemory
from langchain.chat_models import ChatOpenAI
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain.schema import HumanMessage
import subprocess
from contextlib import closing
import asyncio
import requests

# This example uses aiofile for asynchronous file reads.
# It's not a dependency of the project but can be installed
# with `pip install aiofile`.
import aiofile
from amazon_transcribe.client import TranscribeStreamingClient
from amazon_transcribe.handlers import TranscriptResultStreamHandler
from amazon_transcribe.model import TranscriptEvent
from amazon_transcribe.utils import apply_realtime_delay
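
# Overall flow of this Space: Gradio collects text or microphone audio,
# Amazon Transcribe converts speech to text, a LangChain ConversationChain
# backed by OpenAI produces the reply, Amazon Polly synthesizes the reply as
# Mandarin speech, and the D-ID talks API renders an avatar video for it.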
def run_shell_cmd(command):
    # Run a shell command and report its output
    result = subprocess.run(command, shell=True, capture_output=True, text=True)
    if result.returncode == 0:
        print("Command executed successfully")
        print("Command output:")
        print(result.stdout)
    else:
        print("Command failed")
        print("Error message:")
        print(result.stderr)
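
# Amazon Transcribe streaming is started with media_encoding="pcm" below, so
# microphone WAV recordings are first converted to raw 16 kHz mono PCM.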
def wav_to_pcm(input_file, output_file):
    # Quote the paths so files with spaces survive the shell, and pass -y so
    # repeated conversions overwrite any previous output.
    cmd = f'ffmpeg -y -i "{input_file}" -f s16le -ar 16000 -ac 1 -acodec pcm_s16le "{output_file}"'
    run_shell_cmd(cmd)
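
# Usage sketch, with hypothetical paths:
#   wav_to_pcm("/tmp/question.wav", "/tmp/question.pcm")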
openai.api_key = os.environ["OPENAI_API_KEY"]
did_api_key = os.environ["DID_API_KEY"]
avatar_url = "https://create-images-results.d-id.com/DefaultPresenters/Magen_f/image.jpeg"

polly = boto3.client('polly', region_name='us-east-1')
s3 = boto3.client('s3')
transcribe = boto3.client('transcribe')

#memory = ConversationSummaryBufferMemory(llm=ChatOpenAI(), max_token_limit=2048)
memory = ConversationBufferWindowMemory(k=5)
conversation = ConversationChain(
    llm=OpenAI(streaming=True, callbacks=[StreamingStdOutCallbackHandler()], max_tokens=2048, temperature=0.5),
    memory=memory,
)

SAMPLE_RATE = 16000
BYTES_PER_SAMPLE = 2
CHANNEL_NUMS = 1
AUDIO_PATH = ''
CHUNK_SIZE = 1024 * 8
REGION = "us-west-2"

transcript_text = ''
transcriptions = []
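
# MyEventHandler listens on the Transcribe result stream and stores each
# transcript alternative; basic_transcribe() drains it into `transcriptions`.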
class MyEventHandler(TranscriptResultStreamHandler):
    def __init__(self, transcript_result_stream):
        super().__init__(transcript_result_stream)
        self.transcriptions = []

    async def handle_transcript_event(self, transcript_event: TranscriptEvent):
        # Collect every transcript alternative the stream produces; the last
        # one appended is the most complete hypothesis for the utterance.
        results = transcript_event.transcript.results
        for result in results:
            for alt in result.alternatives:
                print(alt.transcript)
                self.transcriptions.append(alt.transcript)
async def basic_transcribe():
    # Set up our client with our chosen AWS region
    client = TranscribeStreamingClient(region=REGION)

    # Start transcription to generate our async stream
    stream = await client.start_stream_transcription(
        language_code="zh-CN",
        media_sample_rate_hz=SAMPLE_RATE,
        media_encoding="pcm",
    )

    async def write_chunks():
        # NOTE: For pre-recorded files longer than 5 minutes, the sent audio
        # chunks should be rate limited to match the realtime bitrate of the
        # audio stream to avoid signing issues.
        async with aiofile.AIOFile(AUDIO_PATH, "rb") as afp:
            reader = aiofile.Reader(afp, chunk_size=CHUNK_SIZE)
            await apply_realtime_delay(
                stream, reader, BYTES_PER_SAMPLE, SAMPLE_RATE, CHANNEL_NUMS
            )
        await stream.input_stream.end_stream()

    # Instantiate our handler and start processing events
    handler = MyEventHandler(stream.output_stream)
    await asyncio.gather(write_chunks(), handler.handle_events())

    # Publish the handler's results to the module-level list that
    # transcribe_func_new() reads from
    transcriptions.extend(handler.transcriptions)
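
# Minimal sketch of driving basic_transcribe() by hand, assuming AUDIO_PATH
# already points at a converted PCM file:
#   AUDIO_PATH = "/tmp/question.pcm"   # hypothetical path
#   asyncio.new_event_loop().run_until_complete(basic_transcribe())
#   print(transcriptions[-1])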
def download_file(bucket_name, object_key, file_path):
    try:
        # Download the file from S3
        s3.download_file(bucket_name, object_key, file_path)
        print(f"File downloaded successfully: {file_path}")
    except Exception as e:
        print(f"Error downloading file: {str(e)}")
def polly_text_to_audio(audio_file_name, text, audio_format):
    # Remove any previous output so we never play a stale reply
    if os.path.exists(audio_file_name):
        os.remove(audio_file_name)
        print("output audio file deleted successfully.")
    else:
        print("output audio file does not exist.")

    polly_response = polly.synthesize_speech(
        Text=text,
        OutputFormat=audio_format,
        SampleRate='16000',
        VoiceId='Zhiyu',
        LanguageCode='cmn-CN',
        Engine='neural',
        LexiconNames=['tigoCN']
    )

    # Access the audio stream from the response
    if "AudioStream" in polly_response:
        # Note: Closing the stream is important because the service throttles on the
        # number of parallel connections. Here we are using contextlib.closing to
        # ensure the close method of the stream object will be called automatically
        # at the end of the with statement's scope.
        with closing(polly_response["AudioStream"]) as stream:
            try:
                # Open a file for writing the output as a binary stream
                with open(audio_file_name, "wb") as file:
                    file.write(stream.read())
            except IOError as error:
                # Could not write to file, exit gracefully
                print(error)
                sys.exit(-1)
    else:
        # The response didn't contain audio data, exit gracefully
        print("Could not stream audio")
        sys.exit(-1)
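
# Usage sketch; the 'tigoCN' lexicon referenced above must already be
# installed in the Polly account for the call to succeed:
#   polly_text_to_audio("/tmp/response.mp3", "你好", "mp3")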
def play_s3_voice(text):
    output_file = "/tmp/response.mp3"
    polly_text_to_audio(output_file, text, "mp3")

    # Upload the file to an S3 bucket
    audio_output_bucket_name = "lingo-audio-materials"
    audio_output_s3_key = "answers/response.mp3"
    s3.upload_file(output_file, audio_output_bucket_name, audio_output_s3_key)

    # Construct the S3 bucket URI
    s3_uri = f"s3://{audio_output_bucket_name}/{audio_output_s3_key}"
    print("audio output bucket name: " + audio_output_bucket_name)
    print("audio output key name: " + audio_output_s3_key)

    mp3_pre_signed_url = s3.generate_presigned_url(
        'get_object',
        Params={'Bucket': audio_output_bucket_name, 'Key': audio_output_s3_key},
        ExpiresIn=3600
    )
    print("mp3_pre_signed_url: " + mp3_pre_signed_url)

    current_dir = os.getcwd()
    print("current dir: " + current_dir)
    print("output_file_location: " + output_file)
    return output_file, mp3_pre_signed_url
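
# D-ID downloads the script audio over HTTP, which is why play_s3_voice()
# returns a time-limited presigned S3 URL instead of the local file path.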
def generate_talk_with_audio(input, avatar_url, api_key=did_api_key):
    url = "https://api.d-id.com/talks"
    payload = {
        "script": {
            "type": "audio",
            "audio_url": input
        },
        "config": {
            "auto_match": "true",
            "result_format": "mp4"
        },
        "source_url": avatar_url
    }
    headers = {
        "accept": "application/json",
        "content-type": "application/json",
        "authorization": "Basic " + api_key
    }
    response = requests.post(url, json=payload, headers=headers)
    return response.json()
def get_a_talk(id, api_key=os.environ.get('DID_API_KEY')):
    url = "https://api.d-id.com/talks/" + id
    headers = {
        "accept": "application/json",
        "authorization": "Basic " + api_key
    }
    response = requests.get(url, headers=headers)
    return response.json()
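
# Rendering is asynchronous on D-ID's side: poll get_a_talk() until the
# 'result_url' field shows up, giving up after roughly 30 seconds.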
def get_mp4_video(input, avatar_url=avatar_url):
    response = generate_talk_with_audio(input=input, avatar_url=avatar_url)
    print("DID response: " + str(response))
    talk = get_a_talk(response['id'])
    video_url = ""
    index = 0
    while index < 30:
        index += 1
        if 'result_url' in talk:
            video_url = talk['result_url']
            return video_url
        else:
            time.sleep(1)
            talk = get_a_talk(response['id'])
    return video_url
def predict(input, history=[]):
    if input is not None:
        history.append(input)
        response = conversation.predict(input=input)
        audio_file, pre_signed_url = play_s3_voice(response)
        video_url = get_mp4_video(input=pre_signed_url, avatar_url=avatar_url)
        video_html = f"""<video width="320" height="240" controls autoplay><source src="{video_url}" type="video/mp4"></video>"""
        history.append(response)
        responses = [(u, b) for u, b in zip(history[::2], history[1::2])]
        return responses, audio_file, video_html, history
    else:
        # No new input: return the static avatar image and no audio
        audio_file = None
        video_html = f'<img src="{avatar_url}" width="320" height="240" alt="avatar">'
        responses = [(u, b) for u, b in zip(history[::2], history[1::2])]
        return responses, audio_file, video_html, history
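
# Two transcription paths are kept: transcribe_func_new streams the audio
# through amazon-transcribe, while transcribe_func_old runs a batch
# Transcribe job via S3. process_audio() below currently uses the old path.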
def transcribe_func_new(audio):
    global AUDIO_PATH
    wav_file = audio
    print("wav_file: " + wav_file)
    #transcript = openai.Audio.transcribe("whisper-1", audio_file)
    #return transcript['text']
    pcm_file = os.path.splitext(wav_file)[0] + ".pcm"
    wav_to_pcm(wav_file, pcm_file)
    if os.path.exists(pcm_file):
        print("pcm file exists")
    else:
        print("pcm file does not exist")

    # Point the streaming transcriber at the converted file; without the
    # `global` declaration above, this would only bind a local variable
    AUDIO_PATH = pcm_file

    # Run the async transcription on a fresh event loop so repeated calls
    # don't trip over a loop that was already closed
    loop = asyncio.new_event_loop()
    try:
        loop.run_until_complete(basic_transcribe())
    finally:
        loop.close()

    transcript_text = transcriptions[-1]
    print("final transcribe script: " + transcript_text)
    return transcript_text
def transcribe_func_old(audio):
    file_name = audio
    #file_directory = os.path.dirname(file_name)
    print("audio_file: " + file_name)
    #transcript = openai.Audio.transcribe("whisper-1", audio_file)
    #return transcript['text']

    # Set up the job parameters
    job_name = "lingo-demo"
    text_output_bucket = 'lingo-text-material'  # this bucket is in us-west-1
    text_output_key = 'transcriptions/' + job_name + '.json'
    language_code = 'zh-CN'

    # Upload the file to an S3 bucket
    audio_input_bucket_name = "lingo-audio-material"
    audio_input_s3_key = "questions/tmp-question-from-huggingface.wav"
    s3.upload_file(file_name, audio_input_bucket_name, audio_input_s3_key)

    # Construct the S3 bucket URI
    s3_uri = f"s3://{audio_input_bucket_name}/{audio_input_s3_key}"

    # Job names must be unique, so delete any previous job with the same name
    response = transcribe.list_transcription_jobs()
    for job in response['TranscriptionJobSummaries']:
        print(job['TranscriptionJobName'])
        if job['TranscriptionJobName'] == job_name:
            response = transcribe.delete_transcription_job(TranscriptionJobName=job_name)
            print("delete transcribe job response: " + str(response))

    # Create the transcription job
    response = transcribe.start_transcription_job(
        TranscriptionJobName=job_name,
        Media={'MediaFileUri': s3_uri},
        MediaFormat='wav',
        LanguageCode=language_code,
        OutputBucketName=text_output_bucket,
        OutputKey=text_output_key
    )
    print("start transcribe job response: " + str(response))
    job_name = response["TranscriptionJob"]["TranscriptionJobName"]

    # Wait for the transcription job to complete
    while True:
        status = transcribe.get_transcription_job(TranscriptionJobName=job_name)['TranscriptionJob']['TranscriptionJobStatus']
        if status in ['COMPLETED', 'FAILED']:
            break
        print("Transcription job still in progress...")
        time.sleep(1)

    # Get the transcript from the configured output location
    transcript_uri = transcribe.get_transcription_job(TranscriptionJobName=job_name)['TranscriptionJob']['Transcript']['TranscriptFileUri']
    print("transcript uri: " + str(transcript_uri))
    transcript_file_content = s3.get_object(Bucket=text_output_bucket, Key=text_output_key)['Body'].read().decode('utf-8')
    print(transcript_file_content)
    json_data = json.loads(transcript_file_content)

    # Extract the transcript value
    transcript_text = json_data['results']['transcripts'][0]['transcript']
    return transcript_text
def process_audio(audio, history=[]):
    text = transcribe_func_old(audio)
    return predict(text, history)
with gr.Blocks(css="#chatbot{height:350px} .overflow-y-auto{height:500px}") as demo:
    chatbot = gr.Chatbot(elem_id="chatbot")
    state = gr.State([])
    with gr.Row():
        txt = gr.Textbox(show_label=False, placeholder="Enter text and press enter").style(container=False)
    with gr.Row():
        audio_input = gr.Audio(source="microphone", type="filepath", label="Audio Input")
    with gr.Row():
        audio_output = gr.Audio(type="filepath", label="Audio Output", elem_id="speaker", interactive=False)
    with gr.Row():
        video = gr.HTML(f'<img src="{avatar_url}" width="320" height="240" alt="avatar">')
    txt.submit(predict, [txt, state], [chatbot, audio_output, video, state])
    audio_input.change(process_audio, [audio_input, state], [chatbot, audio_output, video, state])

demo.launch(debug=True)