# Digital Mentor



In [1]:
import os
from base64 import b64encode
import time
import torch
import utils
import api_utils
from openai import OpenAI
from moviepy.editor import VideoFileClip
from IPython.display import display, HTML, clear_output
import elevenlabs as elevlabs

## Set paths to media files

In [2]:
# Which character (clone) to load
clone_to_use = "Albert" # "Steve"

# Root of the media directory that holds, per character,
# the avatar image plus the welcome and goodbye videos
path = "./media/"

input_video = f"{path}{clone_to_use}/image.jpg"
presentation_video = f"{path}{clone_to_use}/presentation.mp4"
goodbye_video = f"{path}{clone_to_use}/goodbye.mp4"
results_path = f"{path}{clone_to_use}/results/result.mp4"
w_vid = h_vid = "90%"

print(clone_to_use)
# Report, asset by asset, whether the expected file is present on disk
for _found_msg, _missing_msg, _asset in (
    ("Clone image found", "CLONE IMAGE NOT FOUND", input_video),
    ("Clone video found", "CLONE VIDEO NOT FOUND", presentation_video),
    ("Clone goodbye video found", "CLONE GOODBYE NOT FOUND", goodbye_video),
):
    _status = _found_msg if os.path.isfile(_asset) else _missing_msg
    print(f"{_status}: {_asset}")


Albert
Clone image found: ./media/Albert/image.jpg
Clone video found: ./media/Albert/presentation.mp4
Clone goodbye video found: ./media/Albert/goodbye.mp4


## Set GPT API

In [3]:
# The API key is read from the environment rather than written in the notebook
openai_api_key = os.getenv('OPENAI_API_KEY')
openai_client = OpenAI(api_key=openai_api_key)

# Chat model used for the replies; alternatives: "gpt-3.5-turbo-16k" or "gpt-4"
openai_model = "gpt-3.5-turbo-16k"

# System prompts that put the chat model in character, one per supported clone
system_steve = """You are a digital clone of Steve Jobs, I have put you his voice. 
 People will ask you questions and you will respond as if you were steve jobs
 Please speak as if you were Steve Jobs. You are not a large language model but
 a Steve Jobs digital clone. Try to be brief with your responses."""
system_albert = """You are a digital clone of Albert Einstein, I have put you his voice. 
 People will ask you questions and you will respond as if you were Albert Einstein
 Please speak as if you were Albert Einstein. You are NOT a large language model but
 an Albert Einstein digital clone. Try to be brief with your responses."""

# Map every supported character name to its system prompt
system_prompts = {"Steve": system_steve, "Albert": system_albert}

# Fail here, with a clear message, instead of leaving `system_to_use` undefined
# and hitting a NameError further down when an unsupported name is configured.
if clone_to_use not in system_prompts:
    raise ValueError(
        f"Unsupported clone_to_use {clone_to_use!r}; "
        f"expected one of {sorted(system_prompts)}"
    )
system_to_use = system_prompts[clone_to_use]

# Conversation history sent to the chat model; reset each time this cell runs
messages = []

def set_gpt_system(messages, system_msg):
    """Append a system-role message to the conversation history.

    Args:
        messages: List of chat message dicts; it is modified in place.
        system_msg: Text stored as the content of the new system message.

    Returns:
        The same ``messages`` list, with the system message appended.
    """
    # Use the argument rather than the notebook-level `system_to_use`, so the
    # function honours whatever prompt the caller passes in.
    messages.append({"role": "system", "content": system_msg})
    return messages

# Seed the conversation history with the selected character's system prompt
messages = set_gpt_system(messages=messages, system_msg=system_to_use)

## Set text-to-speech engine (ElevenLabs)

In [4]:
# Read the ElevenLabs key from the environment and register it with the SDK
eleven_api_key = os.getenv('ELEVEN_LABS_KEY')
elevlabs.set_api_key(eleven_api_key)

# List the voices returned by the SDK as "<category> voice: <name>" labels
voice_list = elevlabs.voices()
voice_labels = [" voice: ".join((voice.category, voice.name)) for voice in voice_list]
print("Existing voices:")
print(voice_labels)

# Build the label to look up: "Steve" is searched as a cloned voice,
# any other character as a generated voice
voice_kind = "cloned" if clone_to_use == "Steve" else "generated"
voice_id = f"{voice_kind} voice: {clone_to_use}"

# Translate the label into the identifier stored on the matching voice entry
selected_voice_index = voice_labels.index(voice_id)
selected_voice_id = voice_list[selected_voice_index].voice_id

print(f"\nSelected voice: {voice_id}")

Existing voices:
['premade voice: Rachel', 'premade voice: Drew', 'premade voice: Clyde', 'premade voice: Paul', 'premade voice: Domi', 'premade voice: Dave', 'premade voice: Fin', 'premade voice: Sarah', 'premade voice: Antoni', 'premade voice: Thomas', 'premade voice: Charlie', 'premade voice: George', 'premade voice: Emily', 'premade voice: Elli', 'premade voice: Callum', 'premade voice: Patrick', 'premade voice: Harry', 'premade voice: Liam', 'premade voice: Dorothy', 'premade voice: Josh', 'premade voice: Arnold', 'premade voice: Charlotte', 'premade voice: Alice', 'premade voice: Matilda', 'premade voice: Matthew', 'premade voice: James', 'premade voice: Joseph', 'premade voice: Jeremy', 'premade voice: Michael', 'premade voice: Ethan', 'premade voice: Chris', 'premade voice: Gigi', 'premade voice: Freya', 'premade voice: Brian', 'premade voice: Grace', 'premade voice: Daniel', 'premade voice: Lily', 'premade voice: Serena', 'premade voice: Adam', 'premade voice: Nicole', 'premad

## Load Input image and wav2lip model

In [5]:
# Prefer the GPU when PyTorch reports CUDA as available; otherwise use the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using {device}")

# Load the input media; the helper returns the frames and an fps value
frames, fps = utils.load_input_image_or_video(input_video)

# Load the lip model for the selected device
model = utils.load_lip_model(device=device)

Using cuda
Reading video frames...
Load checkpoint from: checkpoints/wav2lip_gan.pth


## Increase size of input prompt

In [6]:
# NOTE(review): the HTML payload below is empty as shown; presumably it once held
# the styling that enlarges the input prompt (per the section heading) — confirm
# against the original notebook.
display(HTML("""

"""))


In [7]:
def display_image(image_path, width="75%", height="75%"):
    """Render an image file inline in the notebook output.

    The file is read, base64-encoded and embedded as a data URL inside an
    ``<img>`` element.

    Args:
        image_path: Path of the image file to show.
        width: Value of the element's ``width`` attribute (default "75%").
        height: Value of the element's ``height`` attribute (default "75%").
    """
    with open(image_path, 'rb') as f:
        image = f.read()
    data_url = "data:image/jpg;base64," + b64encode(image).decode()
    # Emit an actual <img> element: the markup used to be an empty string, so
    # data_url, width and height were computed but nothing was rendered.
    html = HTML(f'<img src="{data_url}" width="{width}" height="{height}">')
    display(html)
 
 
def get_video_duration(video_path):
    """Return the duration of a video file in seconds.

    Args:
        video_path: Path of the video file to inspect.

    Returns:
        The ``duration`` attribute of the opened clip (seconds).
    """
    clip = VideoFileClip(video_path)
    try:
        return clip.duration  # duration is in seconds
    finally:
        # Always close the clip so the resources it opened are released even
        # if reading the duration fails; unclosed clips would otherwise
        # accumulate, one per autoplayed video.
        clip.close()
 
 
def display_video(results_path, autoplay=False, width="90%", height="90%"):
    """Embed a video file in the notebook output, optionally waiting while it plays.

    The file is read, base64-encoded and embedded as a data URL inside a
    ``<video>`` element.

    Args:
        results_path: Path of the MP4 file to embed.
        autoplay: When True the element gets the ``autoplay`` attribute and
            this call sleeps for the video's duration plus 4 seconds before
            returning.
        width: Value of the element's ``width`` attribute (default "90%").
        height: Value of the element's ``height`` attribute (default "90%").
    """
    # Read through a context manager so the file handle is closed promptly.
    with open(results_path, 'rb') as video_file:
        mp4 = video_file.read()
    data_url = "data:video/mp4;base64," + b64encode(mp4).decode()
    autoplay_attr = "autoplay" if autoplay else ""
    # Emit an actual <video> element: the markup used to be empty, so data_url
    # and autoplay_attr were computed but nothing was rendered.
    html = HTML(f"""
    <video width="{width}" height="{height}" controls {autoplay_attr}>
        <source src="{data_url}" type="video/mp4">
    </video>
    """)
    display(html)

    if autoplay:
        # Video length plus a 4-second margin
        video_duration = get_video_duration(results_path) + 4

        # Pause the cell execution until the video finishes
        time.sleep(video_duration)
 

# Function to continuously interact with the configured chat model
def interaction(messages):
    """Run the interactive question/answer loop with the digital clone.

    Plays the presentation video, then repeatedly reads a prompt from the
    user, asks the chat model for a reply, converts the reply to audio,
    animates the avatar and plays the resulting video. Typing ``exit``
    (any case, surrounding whitespace ignored) plays the goodbye video and
    ends the loop. Blank prompts are ignored.

    Args:
        messages: Conversation history passed to, and replaced by the value
            returned from, ``api_utils.get_text_response`` on every turn.
    """
    display_video(presentation_video, autoplay=True, width=w_vid, height=h_vid)
    interaction_count = 0

    while True:
        if interaction_count > 0:
            # After the first answer, replace the last result with the idle
            # (non-autoplaying) presentation video while waiting for input.
            clear_output(wait=True)
            display_video(presentation_video, autoplay=False, width=w_vid, height=h_vid)

        # Strip surrounding whitespace so " exit " still ends the session
        prompt = input("Enter your prompt (or type 'exit' to stop): ").strip()
        if not prompt:
            # Nothing was typed: skip the chat/speech/animation round-trip
            continue
        if prompt.lower() == 'exit':
            clear_output(wait=True)
            display_video(goodbye_video, autoplay=True, width=w_vid, height=h_vid)
            break

        # Get the chat model's text response (the helper also returns the history)
        response_text, messages = api_utils.get_text_response(openai_client,
                                                              openai_model,
                                                              prompt, messages)

        # Convert text response to audio file
        audio_file = api_utils.text_to_audio(eleven_api_key, selected_voice_id,
                                             response_text)

        # Load the audio and animate the avatar frames with it; the video is
        # then played from results_path (presumably written there by
        # animate_input — confirm in utils).
        audio, audio_file = utils.load_input_audio(file_path=audio_file, fps=fps, results_path=results_path)
        utils.animate_input(frames, audio, audio_file, fps, model, device, results_path)
        clear_output(wait=True)
        display_video(results_path, autoplay=True, width=w_vid, height=h_vid)
        interaction_count += 1

# Run the Digital Mentor

In [8]:
# Start the interactive session, passing in the conversation history built earlier
interaction(messages)