# PodcastGPT / functions.py
import os
import json
from pathlib import Path

import requests
import feedparser
import wikipedia
import streamlit as st
from pydub import AudioSegment
from openai import OpenAI

# Single OpenAI client, reused for audio transcription and chat completions
client = OpenAI()

# def load_whisper_api(audio):
# '''Transcribe YT audio to text using Open AI API'''
# import openai
# file = open(audio, "rb")
# transcript = openai.Audio.translate("whisper-1", file)
# return transcript
@st.cache_data
def load_whisper_api(audio):
    '''Transcribe podcast audio to text using the OpenAI Whisper API'''
    with open(audio, "rb") as file:
        # response_format="text" returns the transcript as a plain string
        transcript = client.audio.transcriptions.create(model="whisper-1",
                                                        file=file,
                                                        response_format="text")
    return transcript
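
# Usage sketch (hypothetical local file; assumes OPENAI_API_KEY is set in the environment):
#
#   text = load_whisper_api("episode.mp3")
#   print(text[:200])  # plain string, so no ['text'] indexing is needed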
@st.cache_data
def get_transcribe_podcast(rss_url, local_path):
    '''Download the latest episode from an RSS feed and transcribe it'''
    st.info("Starting Podcast Transcription Function...")
    print("Feed URL: ", rss_url)
    print("Local Path:", local_path)

    # Parse the RSS feed and locate the audio enclosure of the latest episode
    feed = feedparser.parse(rss_url)
    episode_url = None
    for link in feed.entries[0].links:
        if link['type'] == 'audio/mpeg':
            episode_url = link.href
    # Fixed local filename for the downloaded episode
    episode_name = "podcast_episode.mp3"

    # Download the podcast episode
    p = Path(local_path)
    p.mkdir(exist_ok=True)
    st.info("Downloading the podcast episode...")
    episode_path = p.joinpath(episode_name)
    with requests.get(episode_url, stream=True) as r:
        r.raise_for_status()
        with open(episode_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    st.info("Podcast Episode downloaded")

    # Perform the transcription
    st.info("Starting podcast transcription")
    audio_file = str(episode_path)

    # Get the size of the audio file in MB
    audio_size = round(os.path.getsize(audio_file) / (1024 * 1024), 1)

    # The Whisper API accepts files up to 25 MB; larger files are chunked first
    if audio_size <= 25:
        # Use the Whisper API directly
        results = load_whisper_api(audio_file)
    else:
        st.info('File size larger than 25 MB, applying chunking and transcription')
        song = AudioSegment.from_file(audio_file, format='mp3')
        # PyDub handles time in milliseconds; slicing with a step yields consecutive chunks
        twenty_minutes = 20 * 60 * 1000
        chunks = song[::twenty_minutes]
        transcriptions = []
        for i, chunk in enumerate(chunks):
            chunk.export(f'chunk_{i}.mp3', format='mp3')
            transcriptions.append(load_whisper_api(f'chunk_{i}.mp3'))
        results = ' '.join(transcriptions)

    # Return the transcribed text
    st.info("Podcast transcription completed, returning results...")
    return results
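
# Chunking sketch (hypothetical 90-minute file): pydub slicing with a step returns a
# generator of consecutive AudioSegment slices, e.g.
#
#   song = AudioSegment.silent(duration=90 * 60 * 1000)
#   chunks = list(song[::20 * 60 * 1000])  # five slices of 20, 20, 20, 20 and 10 minutes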
@st.cache_data
def get_podcast_summary(podcast_transcript):
    '''Summarize the podcast transcript for a busy professional'''
    instructPrompt = """
    You are a podcast analyst and your main task is to summarize the key points of the podcast
    for a busy professional, highlighting the main points so that the professional has a
    sufficient summary of the podcast. Include any questions you consider important or any
    points that warrant further investigation.
    Please use bullet points.
    """

    request = instructPrompt + podcast_transcript

    chatOutput = client.chat.completions.create(model="gpt-3.5-turbo-16k",
                                                messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                          {"role": "user", "content": request}
                                                          ]
                                                )
    podcastSummary = chatOutput.choices[0].message.content

    return podcastSummary
@st.cache_data
def get_podcast_guest(podcast_transcript):
    '''Extract the guest name, professional title and organization from the transcript'''
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-16k",
        messages=[{"role": "user", "content": podcast_transcript}],
        functions=[
            {
                "name": "get_podcast_guest_information",
                "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "guest_name": {
                            "type": "string",
                            "description": "The full name of the guest who is being interviewed in the podcast",
                        },
                        "guest_organization": {
                            "type": "string",
                            "description": "The name or details of the organization that the podcast guest belongs to, works for or runs",
                        },
                        "guest_title": {
                            "type": "string",
                            "description": "The title, designation or role that the podcast guest holds in the organization, or the type of work they do",
                        },
                    },
                    "required": ["guest_name"],
                },
            }
        ],
        function_call={"name": "get_podcast_guest_information"}
    )

    podcast_guest = ""
    podcast_guest_org = ""
    podcast_guest_title = ""

    # With the v1 client the response is an object, so use attribute access
    response_message = completion.choices[0].message
    if response_message.function_call:
        function_args = json.loads(response_message.function_call.arguments)
        podcast_guest = function_args.get("guest_name")
        podcast_guest_org = function_args.get("guest_organization")
        podcast_guest_title = function_args.get("guest_title")

    return (podcast_guest, podcast_guest_org, podcast_guest_title)
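
# Enrichment sketch (hypothetical follow-up, not called anywhere in this module): the
# wikipedia import above could be used to pull a short bio for the extracted guest, e.g.
#
#   page = wikipedia.page(podcast_guest, auto_suggest=True)
#   guest_summary = page.summary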
@st.cache_data
def get_podcast_highlights(podcast_transcript):
    '''Extract key moments from the podcast transcript'''
    instructPrompt = """
    Extract some key moments in the podcast. These are typically interesting insights from the guest
    or critical questions that the host might have put forward. It could also be a discussion on a
    hot topic or a controversial opinion.
    """

    request = instructPrompt + podcast_transcript

    chatOutput = client.chat.completions.create(model="gpt-3.5-turbo-16k",
                                                messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                          {"role": "user", "content": request}
                                                          ]
                                                )
    podcastHighlights = chatOutput.choices[0].message.content

    return podcastHighlights
@st.cache_data
def process_podcast(url, path):
    '''Run the full pipeline and collect the results into a dictionary'''
    output = {}
    podcast_details = get_transcribe_podcast(url, path)
    podcast_summary = get_podcast_summary(podcast_details)
    podcast_guest_details = get_podcast_guest(podcast_details)
    podcast_highlights = get_podcast_highlights(podcast_details)
    output['podcast_details'] = podcast_details
    output['podcast_summary'] = podcast_summary
    output['podcast_guest'] = podcast_guest_details[0]
    output['podcast_guest_org'] = podcast_guest_details[1]
    output['podcast_guest_title'] = podcast_guest_details[2]
    output['podcast_highlights'] = podcast_highlights
    return output
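
# Usage sketch (hypothetical feed URL and download directory, kept as a comment so that
# importing this module stays side-effect free):
#
#   if __name__ == "__main__":
#       results = process_podcast("https://example.com/feed.xml", "./podcasts/")
#       print(results['podcast_summary'])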