Spaces:
Runtime error
Runtime error
| import os | |
| from pydub import AudioSegment | |
| import openai | |
| from openai import OpenAI | |
| import feedparser | |
| from pathlib import Path | |
| import wikipedia | |
| import json | |
| import streamlit as st | |
| import requests | |
| from docx import Document | |
| from docx.shared import Pt | |
| from docx.enum.text import WD_PARAGRAPH_ALIGNMENT | |
| import io | |
# Module-level OpenAI client shared by the transcription and chat-completion
# helpers defined in this file.
client = OpenAI()
| # def load_whisper_api(audio): | |
| # '''Transcribe YT audio to text using Open AI API''' | |
| # import openai | |
| # file = open(audio, "rb") | |
| # transcript = openai.Audio.translate("whisper-1", file) | |
| # return transcript | |
def export_to_word(podcast_info, podcast_title):
    """Render the podcast analysis as an in-memory Word document.

    Args:
        podcast_info: Mapping that provides the 'podcast_summary',
            'podcast_guest' and 'podcast_highlights' entries.
        podcast_title: Text used for the document's title heading.

    Returns:
        An ``io.BytesIO`` rewound to offset 0 that holds the saved document.
    """
    document = Document()
    document.add_heading(podcast_title, 0)

    # (bold label, key holding the body text) for each section, in page order.
    sections = (
        ("Podcast Summary:\n", 'podcast_summary'),
        ("\nPodcast Guest:\n", 'podcast_guest'),
        ("\nKey Moments:\n", 'podcast_highlights'),
    )
    for label, info_key in sections:
        paragraph = document.add_paragraph()
        label_run = paragraph.add_run(label)
        label_run.bold = True
        label_run.font.size = Pt(12)
        paragraph.add_run(podcast_info[info_key])

    # Hand back a rewound buffer so the caller can read it from the start.
    buffer = io.BytesIO()
    document.save(buffer)
    buffer.seek(0)
    return buffer
def load_whisper_api(audio):
    """Transcribe an audio file to text with the OpenAI Whisper API.

    Args:
        audio: Path of the audio file to upload; it is opened in binary mode.

    Returns:
        Whatever ``client.audio.transcriptions.create`` returns for the
        ``whisper-1`` model with ``response_format="text"``.
    """
    # The context manager closes the handle even when the API call raises;
    # previously every call left an open file descriptor behind.
    with open(audio, "rb") as audio_file:
        return client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file,
            response_format="text",
        )
def get_transcribe_podcast(rss_url, local_path='/data/'):
    """Download a podcast episode and return its transcript.

    Args:
        rss_url: URL fetched directly with ``requests.get``. Despite the name,
            the response body is saved as the MP3 episode itself; no feed
            parsing happens here.
        local_path: Directory that receives ``podcast_episode.mp3`` and any
            intermediate chunk files. It is not created here, so it must
            already exist.

    Returns:
        The transcript text. For files above the size threshold, the
        per-chunk transcripts are joined with ``','``.

    Raises:
        requests.HTTPError: If the download responds with an error status.
    """
    st.info("Starting Podcast Transcription Function...")
    print("Feed URL: ", rss_url)
    print("Local Path:", local_path)

    download_dir = Path(local_path)
    st.info("Downloading the podcast episode...")
    episode_name = "podcast_episode.mp3"
    # A timeout keeps a stalled server from blocking the app forever.
    with requests.get(rss_url, stream=True, timeout=60) as response:
        response.raise_for_status()
        episode_path = download_dir.joinpath(episode_name)
        print(f'episode path {episode_path}')
        with open(episode_path, 'wb') as episode_file:
            for block in response.iter_content(chunk_size=8192):
                episode_file.write(block)
    st.info("Podcast Episode downloaded")

    st.info("Starting podcast transcription")
    size_bytes = os.path.getsize(episode_path)
    audio_size = round(size_bytes / (1024 * 1024), 1)
    print(f'audio size: {audio_size}')

    # Compare exact bytes against the 24 MB threshold that the user-facing
    # message advertises. The rounded figure could let a slightly oversized
    # file through, and 24 MB leaves headroom below the API upload limit.
    max_single_upload_bytes = 24 * 1024 * 1024
    if size_bytes <= max_single_upload_bytes:
        results = load_whisper_api(episode_path)
    else:
        st.info('File size larger than 24mb, applying chunking and transcription')
        song = AudioSegment.from_file(episode_path, format='mp3')
        # PyDub handles time in milliseconds.
        twenty_minutes = 20 * 60 * 1000
        transcriptions = []
        for i, chunk in enumerate(song[::twenty_minutes]):
            # Keep chunk files next to the episode instead of in whatever the
            # current working directory happens to be.
            chunk_path = str(download_dir.joinpath(f'chunk_{i}.mp3'))
            chunk.export(chunk_path, format='mp3')
            transcriptions.append(load_whisper_api(chunk_path))
        results = ','.join(transcriptions)

    st.info("Podcast transcription completed, returning results...")
    return results
def get_podcast_summary(podcast_transcript):
    """Ask the chat model for a bullet-point summary of the transcript.

    Args:
        podcast_transcript: Transcript text appended to the instructions.

    Returns:
        The message content of the first completion choice.
    """
    analyst_brief = (
        "\n"
        "You are a podcast analyst and your main task is to summarize the key and important points of\n"
        "the podcast for a busy professional by highlighting the main and important points\n"
        "to ensure the professional has a sufficient summary of the podcast. Include any questions you consider important or\n"
        "any points that warrant further investigation.\n"
        "Please use bulletpoints.\n"
    )
    conversation = [
        {"role": "system", "content": "You are a helpful podcast analyzer assistant"},
        {"role": "user", "content": analyst_brief + podcast_transcript},
    ]
    response = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=conversation,
    )
    return response.choices[0].message.content
def get_podcast_guest(podcast_transcript):
    """Extract the guest's name, organization and title from a transcript.

    The chat model is forced to call ``get_podcast_guest_information`` so the
    answer comes back as JSON arguments rather than free text.

    Args:
        podcast_transcript: Transcript text sent as the user message.

    Returns:
        A ``(guest_name, guest_organization, guest_title)`` tuple of strings.
        Any field that is missing, null, or unparseable comes back as ``""``,
        so callers always receive strings.
    """
    guest_function = {
        "name": "get_podcast_guest_information",
        "description": "Get information on the podcast guest using their full name and the name of the organization they are part of to search for them on Wikipedia or Google",
        "parameters": {
            "type": "object",
            "properties": {
                "guest_name": {
                    "type": "string",
                    "description": "The full name of the guest who is being interviewed in the podcast",
                },
                "guest_organization": {
                    "type": "string",
                    "description": "The name or details of the organization that the podcast guest belongs to, works for or runs",
                },
                "guest_title": {
                    "type": "string",
                    "description": "The title, designation or role the podcast guest holds or type of work that the podcast guest in the organization does",
                },
            },
            "required": ["guest_name"],
        },
    }
    completion = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[{"role": "user", "content": podcast_transcript}],
        functions=[guest_function],
        function_call={"name": "get_podcast_guest_information"},
    )

    response_message = completion.choices[0].message.function_call
    print(f'func res: {response_message}')
    if not response_message:
        return ("", "", "")

    try:
        function_args = json.loads(response_message.arguments)
    except json.JSONDecodeError as err:
        # The arguments are model-generated text and are not guaranteed to be
        # valid JSON; fall back to empty details instead of crashing.
        print(f'func res could not be parsed: {err}')
        return ("", "", "")

    # Only guest_name is required by the schema, so the other keys may be
    # absent or null; normalise those to "" to match the no-call path.
    return (
        function_args.get("guest_name") or "",
        function_args.get("guest_organization") or "",
        function_args.get("guest_title") or "",
    )
def get_podcast_highlights(podcast_transcript):
    """Ask the chat model to extract key moments from the transcript.

    Args:
        podcast_transcript: Transcript text appended to the instructions.

    Returns:
        The message content of the first completion choice.
    """
    instructPrompt = (
        "\n"
        "Extract some key moments in the podcast. These are typically interesting insights from the guest or critical questions that the host might have put forward. It could also be a discussion on a hot topic or controversial opinion\n"
    )
    request = instructPrompt + podcast_transcript
    chatOutput = client.chat.completions.create(
        model="gpt-4-turbo-preview",
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            # Send the instructions together with the transcript. Previously
            # only the bare transcript was sent, so the model never saw the
            # request to extract key moments.
            {"role": "user", "content": request},
        ],
    )
    return chatOutput.choices[0].message.content
def process_podcast(url, path='/data/'):
    """Transcribe a podcast and collect every derived analysis in one dict.

    Args:
        url: Episode URL handed to ``get_transcribe_podcast``.
        path: Download directory handed to ``get_transcribe_podcast``.

    Returns:
        A dict with the keys 'podcast_details' (the transcript),
        'podcast_summary', 'podcast_guest', 'podcast_guest_org',
        'podcast_guest_title' and 'podcast_highlights'.
    """
    transcript = get_transcribe_podcast(url, path)
    summary = get_podcast_summary(transcript)
    guest = get_podcast_guest(transcript)
    highlights = get_podcast_highlights(transcript)
    return {
        'podcast_details': transcript,
        'podcast_summary': summary,
        'podcast_guest': guest[0],
        'podcast_guest_org': guest[1],
        'podcast_guest_title': guest[2],
        'podcast_highlights': highlights,
    }