from youtube_transcript_api import YouTubeTranscriptApi
import openai
from urllib.parse import urlparse, parse_qs
import requests
from requests.auth import HTTPBasicAuth
import os
import logging
logging.basicConfig(filename='app.log', filemode='a',
                    format='%(name)s - %(levelname)s - %(message)s', level=logging.DEBUG)
def get_video_id_from_url(url):
    """
    Extracts the YouTube video ID from a given URL.

    Supports both 'youtube.com' and 'youtu.be' URL formats. For 'youtube.com', it looks for the 'v'
    query parameter. For 'youtu.be', it extracts the ID directly from the path.

    Parameters:
        url (str): The full URL of the YouTube video.

    Returns:
        str: The extracted video ID if found, otherwise None.

    Note:
        This function silently handles exceptions and returns None if the video ID cannot be extracted.
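
    Example (illustrative URLs, not taken from real application data):
        >>> get_video_id_from_url("https://www.youtube.com/watch?v=dQw4w9WgXcQ")
        'dQw4w9WgXcQ'
        >>> get_video_id_from_url("https://youtu.be/dQw4w9WgXcQ")
        'dQw4w9WgXcQ'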
"""
try:
url_data = urlparse(url)
if url_data.hostname == 'www.youtube.com' or url_data.hostname == 'youtube.com':
query = parse_qs(url_data.query)
video_id = query.get("v")
if video_id:
#logging.info(f"Video ID {video_id[0]} extracted from URL.")
return video_id[0]
elif url_data.hostname == 'youtu.be':
# Extract the video ID from the path for youtu.be URLs
video_id = url_data.path[1:] # Remove the leading '/'
if video_id:
#logging.info(f"Video ID {video_id} extracted from URL.")
return video_id
#logging.warning(f"No video ID found in URL: {url}")
return None
except Exception:
#logging.error(f"Error extracting video ID from URL {url}: {e}")
return None
def get_first_youtube_video_url(urls):
    """
    Finds and returns the first YouTube video URL from a list of URLs.

    Iterates over a provided list of URLs, checking each for a substring that matches
    'youtube' or 'youtu.be'. Returns the first URL that matches these criteria.

    Parameters:
        urls (list of str): A list containing URLs to be checked.

    Returns:
        str: The first YouTube video URL found in the list, or None if no YouTube URL is found.
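
    Example (illustrative URLs):
        >>> get_first_youtube_video_url(["https://example.com", "https://youtu.be/abc123"])
        'https://youtu.be/abc123'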
"""
for url in urls:
if 'youtube' in url or 'youtu.be' in url:
return url
return None
def get_youtube_url(opportunity_id):
    """
    Retrieves the YouTube video URL associated with a given opportunity ID from the Lever API.

    This function makes a GET request to the Lever API to fetch the opportunity details using the provided
    opportunity ID. It then extracts and returns the first YouTube video URL found in the 'links' section
    of the opportunity data.

    Parameters:
        opportunity_id (str): The unique identifier for the opportunity in the Lever system.

    Returns:
        str: The YouTube video URL associated with the opportunity, or None if no YouTube URL is found.

    Note:
        Requires the 'LeverKey' environment variable to be set for authentication with the Lever API.
    """
    url = 'https://api.lever.co/v1/opportunities/{}'.format(opportunity_id)
    response = requests.get(url, auth=HTTPBasicAuth(os.getenv('LeverKey'), ''))
    links = response.json()['data']['links']
    youtube_link = get_first_youtube_video_url(links)
    return youtube_link
def parse_decision_to_binary(decision_text):
    """
    Converts a decision text to a binary outcome based on the presence of the word 'yes'.

    This function checks if the word 'yes' is present in the provided decision text, performing
    a case-insensitive comparison. It is designed to interpret a textual decision as a binary
    outcome, where the presence of 'yes' indicates a positive (True) decision, and its absence
    indicates a negative (False) decision.

    Parameters:
        decision_text (str): The decision text to be analyzed.

    Returns:
        bool: True if 'yes' is present in the decision text, False otherwise.
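
    Example:
        >>> parse_decision_to_binary("Yes, the candidate meets the rubric.")
        True
        >>> parse_decision_to_binary("No, the submission is too short.")
        False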
"""
decision_text_lower = decision_text.lower()
return "yes" in decision_text_lower
def get_transcript_data_and_pause_count(video_id):
    """
    Fetches a video's transcript, calculates its total duration in minutes, and counts pauses between segments.

    Utilizes the YouTubeTranscriptApi to retrieve the English transcript of a video given its ID, then analyzes
    the transcript to determine the total duration and estimate the number of pauses based on gaps between
    transcript segments.

    Parameters:
        video_id (str): The unique identifier of the YouTube video.

    Returns:
        tuple: A tuple containing the full transcript text (str), total duration in minutes (int),
               and the estimated number of pauses (int), or (None, None, None) if an error occurs
               or no transcript is available.
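
    Note:
        Each transcript segment returned by YouTubeTranscriptApi.get_transcript is a dict with
        'text', 'start', and 'duration' keys; a pause is counted whenever one segment starts
        after the previous segment has ended.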
"""
try:
transcript = YouTubeTranscriptApi.get_transcript(video_id, languages=['en'])
if transcript:
last_segment = transcript[-1]
total_duration = last_segment['start'] + last_segment['duration']
# Estimate the number of pauses
pauses = 0
for i in range(1, len(transcript)):
current_start = transcript[i]['start']
previous_end = transcript[i-1]['start'] + transcript[i-1]['duration']
if current_start > previous_end:
pauses += 1
full_transcript = " ".join(segment['text'] for segment in transcript)
logging.info(f"Transcript retrieved successfully for video ID {video_id}.")
return full_transcript, total_duration // 60, pauses
except Exception as e:
logging.error(f"Failed to retrieve transcript for video ID {video_id}. Error: {e}")
return None, None, None
def analyze_transcript(url):
    """
    Analyzes a YouTube video's transcript for content quality, using a predefined prompt for GPT evaluation.

    This function reads a prompt from 'prompt.txt', extracts the video ID from the provided URL, retrieves the
    video's transcript and its analysis metrics (total duration and pauses), and evaluates these metrics against
    a GPT model to determine if the candidate qualifies for an interview.

    Parameters:
        url (str): The URL of the YouTube video to be analyzed.

    Returns:
        str: A message indicating whether the candidate qualifies for an interview, an error message if the
             video URL is invalid or the transcript could not be retrieved, or a detailed error message if
             any other error occurs during processing.
    """
    try:
        with open('prompt.txt', 'r') as file:
            prompt = file.read()
    except Exception as e:
        logging.error(f"Error opening or reading from 'prompt.txt': {e}")
        return "Error processing the prompt file."
    try:
        video_id = get_video_id_from_url(url)
        if not video_id:
            logging.error("Invalid URL provided.")
            return "Unable to process the video URL. Currently only YouTube URLs are accepted."
        full_transcript, total_duration, pauses = get_transcript_data_and_pause_count(video_id)
        if full_transcript is None:  # There was an error retrieving the transcript
            logging.error("Error retrieving the transcript.")
            return "Unable to retrieve a transcript for the provided video."
        # Fill in the prompt for GPT evaluation based on the rubric
        prompt = prompt.format(full_transcript, pauses, total_duration)
        # Using the new OpenAI client structure
        client = openai.OpenAI(api_key=os.getenv('OpenAIKey'))
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        decision = parse_decision_to_binary(response.choices[0].message.content.strip())
        if decision:
            return "The candidate qualifies for an interview."
        return "The candidate does not qualify for an interview."
    except Exception as e:
        logging.error(f"An error occurred during the analysis: {e}")
        return f"An error occurred during the processing. {e}"