Spaces:
Runtime error
Runtime error
from youtube_transcript_api import YouTubeTranscriptApi | |
import openai | |
from urllib.parse import urlparse, parse_qs | |
import requests | |
from requests.auth import HTTPBasicAuth | |
import os | |
import logging | |
# Route every record at DEBUG level and above from the root logger into
# 'app.log'; filemode 'a' appends, so earlier runs are not truncated.
logging.basicConfig(
    level=logging.DEBUG,
    format='%(name)s - %(levelname)s - %(message)s',
    filename='app.log',
    filemode='a',
)
def get_video_id_from_url(url):
    """
    Extracts the YouTube video ID from a given URL.

    Recognised forms:
      * 'youtube.com/watch?v=<id>' on the bare, 'www.', 'm.' and 'music.' hosts
      * 'youtube.com/embed/<id>', '/shorts/<id>', '/live/<id>' and '/v/<id>'
      * 'youtu.be/<id>' (trailing slashes or extra path segments are ignored)
    URLs written without a scheme (e.g. 'www.youtube.com/watch?v=<id>') are
    accepted as well.

    Parameters:
        url (str): The full URL of the YouTube video.
    Returns:
        str: The extracted video ID if found, otherwise None.
    Note:
        This function does not raise for bad input: non-string values, empty
        strings, unparsable URLs and non-YouTube URLs all yield None.
    """
    if not isinstance(url, str):
        return None
    candidate = url.strip()
    if not candidate:
        return None

    youtube_hosts = {
        'youtube.com',
        'www.youtube.com',
        'm.youtube.com',
        'music.youtube.com',
    }
    # First path segment of URL shapes that carry the ID in the path.
    id_path_prefixes = {'embed', 'shorts', 'live', 'v'}

    try:
        parts = urlparse(candidate)
        if not parts.scheme and not parts.netloc:
            # Without a scheme urlparse puts the host into the path, so
            # re-parse with a scheme to get a usable hostname.
            parts = urlparse('https://' + candidate)
        host = parts.hostname
        segments = [segment for segment in parts.path.split('/') if segment]

        if host in youtube_hosts:
            ids = parse_qs(parts.query).get('v')
            if ids:
                return ids[0]
            if len(segments) >= 2 and segments[0] in id_path_prefixes:
                return segments[1]
        elif host == 'youtu.be':
            # The ID is the first path segment; taking the raw path would
            # leak a trailing '/' into the ID.
            if segments:
                return segments[0]
        return None
    except ValueError:
        # urlparse raises ValueError for malformed netlocs (e.g. bad IPv6).
        return None
def get_first_youtube_video_url(urls):
    """
    Returns the first entry of `urls` that looks like a YouTube link.

    An entry qualifies when it contains the substring 'youtube' or the
    substring 'youtu.be'; entries are examined in order and the scan stops
    at the first match.

    Parameters:
        urls (list of str): A list containing URLs to be checked.
    Returns:
        str: The first matching URL, or None if no entry matches.
    """
    markers = ('youtube', 'youtu.be')
    matching_links = (
        link for link in urls
        if any(marker in link for marker in markers)
    )
    return next(matching_links, None)
def get_youtube_url(opportunity_id, timeout=10):
    """
    Retrieves the YouTube video URL associated with a given opportunity ID from the Lever API.

    Makes a GET request to 'https://api.lever.co/v1/opportunities/<id>' using
    HTTP basic auth (API key as the user name, empty password), then returns
    the first YouTube URL found in the 'links' list of the response's 'data'
    object.

    Parameters:
        opportunity_id (str): The unique identifier for the opportunity in the Lever system.
        timeout (float): Seconds to wait for the Lever API before giving up.
            Without a timeout `requests` may block indefinitely.
    Returns:
        str: The YouTube video URL associated with the opportunity, or None if
        no YouTube URL is found (including when 'links' is missing or empty).
    Raises:
        requests.exceptions.RequestException: On connection problems or timeout.
        KeyError: If the response JSON has no 'data' entry.
    Note:
        Requires the 'LeverKey' environment variable to be set for authentication with the Lever API.
    """
    api_key = os.getenv('LeverKey')
    if not api_key:
        # Make the misconfiguration visible instead of silently sending
        # unusable credentials.
        logging.error("The 'LeverKey' environment variable is not set; "
                      "the Lever API request is unlikely to authenticate.")

    url = 'https://api.lever.co/v1/opportunities/{}'.format(opportunity_id)
    response = requests.get(url, auth=HTTPBasicAuth(api_key, ''), timeout=timeout)

    # Treat an absent or null 'links' field as "no links".
    links = response.json()['data'].get('links') or []
    return get_first_youtube_video_url(links)
def parse_decision_to_binary(decision_text):
    """
    Converts a decision text to a binary outcome based on the presence of the word 'yes'.

    The match is case-insensitive and requires 'yes' to appear as a whole
    word, so words that merely contain those letters (e.g. 'eyes',
    'yesterday') do not count as a positive decision.

    Parameters:
        decision_text (str): The decision text to be analyzed.
    Returns:
        bool: True if the word 'yes' is present in the decision text, False
        otherwise (including for None or empty text).
    """
    # Local import: `re` is not among the imports visible at the top of this file.
    import re

    if not decision_text:
        return False
    return re.search(r'\byes\b', decision_text.lower()) is not None
def _fetch_english_transcript(video_id):
    """Returns the English transcript as a list of dicts with 'text', 'start' and 'duration'."""
    legacy_fetch = getattr(YouTubeTranscriptApi, 'get_transcript', None)
    if legacy_fetch is not None:
        return legacy_fetch(video_id, languages=['en'])
    # Newer youtube-transcript-api releases dropped the static helper in
    # favour of an instance `fetch()`; `to_raw_data()` yields the same
    # list-of-dicts shape the legacy call returned.
    return YouTubeTranscriptApi().fetch(video_id, languages=['en']).to_raw_data()


def get_transcript_data_and_pause_count(video_id, min_pause_seconds=0.0):
    """
    Fetches a video's transcript, calculates its total duration in minutes, and counts pauses between segments.

    The duration is the end time of the last transcript segment
    ('start' + 'duration'), floored to whole minutes. A pause is counted
    whenever a segment starts more than `min_pause_seconds` after the
    previous segment ends.

    Parameters:
        video_id (str): The unique identifier of the YouTube video.
        min_pause_seconds (float): Minimum gap between consecutive segments
            for the gap to count as a pause. The default of 0.0 counts every gap.
    Returns:
        tuple: A tuple containing the full transcript text (str), total duration in whole minutes (int),
        and the estimated number of pauses (int), or (None, None, None) if the transcript is empty
        or an error occurs.
    """
    try:
        transcript = _fetch_english_transcript(video_id)
        if not transcript:
            logging.error(f"Transcript for video ID {video_id} is empty.")
            return None, None, None

        last_segment = transcript[-1]
        total_seconds = last_segment['start'] + last_segment['duration']

        # Compare each segment's start with the end of the one before it.
        pauses = sum(
            1
            for previous, current in zip(transcript, transcript[1:])
            if current['start'] > previous['start'] + previous['duration'] + min_pause_seconds
        )

        full_transcript = " ".join(segment['text'] for segment in transcript)
        logging.info(f"Transcript retrieved successfully for video ID {video_id}.")
        # Timestamps are floats, so `//` alone would return e.g. 2.0.
        return full_transcript, int(total_seconds // 60), pauses
    except Exception as e:
        # Boundary around a third-party API: any failure is reported through
        # the (None, None, None) sentinel rather than raised.
        logging.error(f"Failed to retrieve transcript for video ID {video_id}. Error: {e}")
        return None, None, None
def analyze_transcript(url):
    """
    Analyzes a YouTube video's transcript for content quality, using a predefined prompt for GPT evaluation.

    Reads the prompt template from 'prompt.txt', extracts the video ID from
    the URL, retrieves the transcript together with its duration and pause
    count, fills the template with (transcript, pauses, duration) in that
    positional order, and asks the 'gpt-4' chat model for a decision.

    Parameters:
        url (str): The URL of the YouTube video to be analyzed.
    Returns:
        str: A message indicating whether the candidate qualifies for an interview, an error message if the
        prompt file cannot be read, the video URL is invalid or the transcript could not be retrieved,
        or a detailed error message if any other error occurs during processing.
    Note:
        Requires the 'OpenAIKey' environment variable for the OpenAI client.
    """
    try:
        # Explicit encoding keeps the prompt identical across platforms.
        with open('prompt.txt', 'r', encoding='utf-8') as file:
            prompt = file.read()
    except Exception as e:
        logging.error(f"Error opening or reading from 'prompt.txt': {e}")
        return "Error processing the prompt file."

    try:
        video_id = get_video_id_from_url(url)
        if not video_id:
            logging.error("Invalid URL provided.")
            return "Unable to process the video URL. Currently only YouTube URLs are accepted."

        full_transcript, total_duration, pauses = get_transcript_data_and_pause_count(
            video_id)
        if full_transcript is None:  # If there was an error retrieving the transcript
            logging.error("Error retrieving the transcript.")
            # Return a message, not `pauses` (which is None on this path).
            return "Unable to retrieve the transcript for the video."

        # Fill the rubric prompt; placeholder order is transcript, pauses, duration.
        prompt = prompt.format(full_transcript, pauses, total_duration)

        client = openai.OpenAI(api_key=os.getenv('OpenAIKey'))
        response = client.chat.completions.create(
            model="gpt-4",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt},
            ],
        )
        decision = parse_decision_to_binary(response.choices[0].message.content.strip())
        if decision:
            return "The candidate qualifies for an interview."
        return "The candidate does not qualify for an interview."
    except Exception as e:
        # Top-level boundary: surface any failure to the caller as text.
        logging.error(f"An error occurred during the analysis: {e}")
        return f"An error occurred during the processing. {e}"