# PodcastGPT / functions.py
import os
import json
from pathlib import Path

import requests
import feedparser
import wikipedia
import streamlit as st
from pydub import AudioSegment
from openai import OpenAI

# Single OpenAI client, reused for audio transcription and chat completions
client = OpenAI()

# def load_whisper_api(audio):
# '''Transcribe YT audio to text using Open AI API'''
# import openai
# file = open(audio, "rb")
# transcript = openai.Audio.translate("whisper-1", file)
# return transcript
@st.cache_data
def load_whisper_api(audio):
    '''Transcribe podcast audio to text using the OpenAI Whisper API'''
    with open(audio, "rb") as file:
        # response_format="text" returns the transcript as a plain string
        transcript = client.audio.transcriptions.create(model="whisper-1",
                                                        file=file,
                                                        response_format="text")
    return transcript
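
# Usage sketch (hypothetical local file; assumes OPENAI_API_KEY is set in the environment):
#
#   text = load_whisper_api("episode.mp3")
#   print(text[:200])  # plain string, so no ['text'] indexing is needed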
@st.cache_data
def get_transcribe_podcast(rss_url, local_path):
    '''Download the latest episode from an RSS feed and transcribe it'''
    st.info("Starting Podcast Transcription Function...")
    print("Feed URL: ", rss_url)
    print("Local Path:", local_path)

    # Parse the RSS feed and locate the audio enclosure of the latest episode
    feed = feedparser.parse(rss_url)
    episode_url = None
    for link in feed.entries[0].links:
        if link['type'] == 'audio/mpeg':
            episode_url = link.href
    # Fixed local filename for the downloaded episode
    episode_name = "podcast_episode.mp3"

    # Download the podcast episode
    p = Path(local_path)
    p.mkdir(exist_ok=True)
    st.info("Downloading the podcast episode...")
    episode_path = p.joinpath(episode_name)
    with requests.get(episode_url, stream=True) as r:
        r.raise_for_status()
        with open(episode_path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)
    st.info("Podcast Episode downloaded")

    # Perform the transcription
    st.info("Starting podcast transcription")
    audio_file = str(episode_path)

    # Get the size of the audio file in MB
    audio_size = round(os.path.getsize(audio_file) / (1024 * 1024), 1)

    # The Whisper API accepts files up to 25 MB; larger files are chunked first
    if audio_size <= 25:
        # Use the Whisper API directly
        results = load_whisper_api(audio_file)
    else:
        st.info('File size larger than 25 MB, applying chunking and transcription')
        song = AudioSegment.from_file(audio_file, format='mp3')
        # PyDub handles time in milliseconds; slicing with a step yields consecutive chunks
        twenty_minutes = 20 * 60 * 1000
        chunks = song[::twenty_minutes]
        transcriptions = []
        for i, chunk in enumerate(chunks):
            chunk.export(f'chunk_{i}.mp3', format='mp3')
            transcriptions.append(load_whisper_api(f'chunk_{i}.mp3'))
        results = ' '.join(transcriptions)

    # Return the transcribed text
    st.info("Podcast transcription completed, returning results...")
    return results
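
# Chunking sketch (hypothetical 90-minute file): pydub slicing with a step returns a
# generator of consecutive AudioSegment slices, e.g.
#
#   song = AudioSegment.silent(duration=90 * 60 * 1000)
#   chunks = list(song[::20 * 60 * 1000])  # five slices of 20, 20, 20, 20 and 10 minutes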
@st.cache_data
def get_podcast_summary(podcast_transcript):
    '''Summarize the podcast transcript for a busy professional'''
    instructPrompt = """
    You are a podcast analyst and your main task is to summarize the key points of the podcast
    for a busy professional, highlighting the main points so that the professional has a
    sufficient summary of the podcast. Include any questions you consider important or any
    points that warrant further investigation.
    Please use bullet points.
    """

    request = instructPrompt + podcast_transcript

    chatOutput = client.chat.completions.create(model="gpt-3.5-turbo-16k",
                                                messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                          {"role": "user", "content": request}
                                                          ]
                                                )
    podcastSummary = chatOutput.choices[0].message.content

    return podcastSummary
@st.cache_data
def get_podcast_guest(podcast_transcript):
    '''Extract the guest name, professional title and organization from the transcript'''
    completion = client.chat.completions.create(
        model="gpt-3.5-turbo-16k",
        messages=[{"role": "user", "content": podcast_transcript}],
        functions=[
            {
                "name": "get_podcast_guest_information",
                "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
                "parameters": {
                    "type": "object",
                    "properties": {
                        "guest_name": {
                            "type": "string",
                            "description": "The full name of the guest who is being interviewed in the podcast",
                        },
                        "guest_organization": {
                            "type": "string",
                            "description": "The name or details of the organization that the podcast guest belongs to, works for or runs",
                        },
                        "guest_title": {
                            "type": "string",
                            "description": "The title, designation or role that the podcast guest holds in the organization, or the type of work they do",
                        },
                    },
                    "required": ["guest_name"],
                },
            }
        ],
        function_call={"name": "get_podcast_guest_information"}
    )

    podcast_guest = ""
    podcast_guest_org = ""
    podcast_guest_title = ""

    # With the v1 client the response is an object, so use attribute access
    response_message = completion.choices[0].message
    if response_message.function_call:
        function_args = json.loads(response_message.function_call.arguments)
        podcast_guest = function_args.get("guest_name")
        podcast_guest_org = function_args.get("guest_organization")
        podcast_guest_title = function_args.get("guest_title")

    return (podcast_guest, podcast_guest_org, podcast_guest_title)
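
# Enrichment sketch (hypothetical follow-up, not called anywhere in this module): the
# wikipedia import above could be used to pull a short bio for the extracted guest, e.g.
#
#   page = wikipedia.page(podcast_guest, auto_suggest=True)
#   guest_summary = page.summary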
@st.cache_data
def get_podcast_highlights(podcast_transcript):
    '''Extract key moments from the podcast transcript'''
    instructPrompt = """
    Extract some key moments in the podcast. These are typically interesting insights from the guest
    or critical questions that the host might have put forward. It could also be a discussion on a
    hot topic or a controversial opinion.
    """

    request = instructPrompt + podcast_transcript

    chatOutput = client.chat.completions.create(model="gpt-3.5-turbo-16k",
                                                messages=[{"role": "system", "content": "You are a helpful assistant."},
                                                          {"role": "user", "content": request}
                                                          ]
                                                )
    podcastHighlights = chatOutput.choices[0].message.content

    return podcastHighlights
@st.cache_data
def process_podcast(url, path):
    '''Run the full pipeline and collect the results into a dictionary'''
    output = {}
    podcast_details = get_transcribe_podcast(url, path)
    podcast_summary = get_podcast_summary(podcast_details)
    podcast_guest_details = get_podcast_guest(podcast_details)
    podcast_highlights = get_podcast_highlights(podcast_details)
    output['podcast_details'] = podcast_details
    output['podcast_summary'] = podcast_summary
    output['podcast_guest'] = podcast_guest_details[0]
    output['podcast_guest_org'] = podcast_guest_details[1]
    output['podcast_guest_title'] = podcast_guest_details[2]
    output['podcast_highlights'] = podcast_highlights
    return output
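
# Usage sketch (hypothetical feed URL and download directory, kept as a comment so that
# importing this module stays side-effect free):
#
#   if __name__ == "__main__":
#       results = process_podcast("https://example.com/feed.xml", "./podcasts/")
#       print(results['podcast_summary'])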