# built in
from io import StringIO
import re
import time

# 3rd party - located in requirements.txt
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai

HEADER_SIZE = 5  # number of lines in the transcript header
CHUNK_SIZE = 2000  # approximate length in characters for each chunk being summarized
TEMPERATURE = 0  # sampling temperature passed to the chat completion request

# Instructions sent as the system message with every chunk.
# NOTE(review): the "2," (comma instead of period) is reproduced as found in the
# original prompt; changing the prompt text changes model output, so fix it
# deliberately rather than as a drive-by.
SYSTEM_PROMPT = (
    "As a professional summarizer, create a concise and comprehensive summary "
    "of the provided conversation, while adhering to these guidelines:\n"
    "1. Craft a summary that is detailed, thorough, in-depth, and complex, "
    "while maintaining clarity and conciseness.\n"
    "2, Incorporate main ideas and essential information, eliminating "
    "extraneous language and focusing on critical aspects.\n"
    "3. Rely strictly on the provided text, without including external "
    "information.\n"
    "4. Format the summary in paragraph form for easy understanding.\n"
    '5. Do not start the response with "In this conversation", '
    '"During this conversation", "During the conversation" or a similar phrase\n'
)


def load_transcript(input_file):
    """Load the text from the transcript uploaded using the file uploader widget.

    Args:
        input_file: Uploaded file object exposing ``getvalue()`` that returns
            the raw bytes of the transcript.

    Returns:
        A ``(header, transcript)`` tuple: ``header`` is the list of the first
        ``HEADER_SIZE`` lines (line endings kept) and ``transcript`` is the
        remaining text joined back into one string.
    """
    # Transform the file from bytes to a string. "utf-8-sig" decodes plain
    # UTF-8 exactly like "UTF-8" but also drops a leading byte-order mark, so
    # a BOM never leaks into the first header line (the meeting title).
    input_string = StringIO(input_file.getvalue().decode("utf-8-sig"))

    # Google Meet transcripts have a header with info like the meeting title,
    # date, and attendees. Extract it separately instead of passing it to the
    # summarizer.
    file_text = input_string.readlines()
    header = file_text[:HEADER_SIZE]
    transcript = "".join(file_text[HEADER_SIZE:])
    return header, transcript


def chunk_transcript(transcript: str):
    """Split the transcript into 5-minute sections and then into sub-chunks.

    Args:
        transcript: Transcript body (header already removed).

    Returns:
        A list of lists of strings. The outer list represents 5-minute
        sections of the meeting; each inner list holds the subdivisions of
        that section produced by the text splitter with ``CHUNK_SIZE``.
    """
    # Google Meet transcripts show the timestamp every 5 minutes, so split the
    # transcript on timestamps whose seconds field is "00".
    timestamp_regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
    five_minute_chunks = re.split(timestamp_regex_pattern, transcript)

    # Subdivide each 5-minute section into appropriately sized chunks.
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
    return [
        text_splitter.split_text(five_minute_chunk)
        for five_minute_chunk in five_minute_chunks
    ]


def summarize_chunks(five_minute_chunks, user_api_key, debug=False):
    """Create summaries of each chunk of the transcript.

    Args:
        five_minute_chunks: List of lists of strings as returned by
            ``chunk_transcript``.
        user_api_key: OpenAI API key forwarded with every request.
        debug: When true, skip the API call and use a placeholder summary.

    Returns:
        A list of lists of summary strings with the same nesting as
        ``five_minute_chunks``.
    """
    total_chunks = sum(len(sub_chunks) for sub_chunks in five_minute_chunks)
    number_of_summarized_chunks = 0

    def progress_text():
        # Single source for the progress label so both call sites stay in sync.
        return f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks..."

    progress_bar = st.progress(number_of_summarized_chunks, progress_text())

    five_minute_summaries = []
    for sub_chunks in five_minute_chunks:
        summaries = []
        for chunk in sub_chunks:
            if not debug:
                messages = [
                    {"role": "system", "content": SYSTEM_PROMPT},
                    {"role": "user", "content": chunk},
                ]
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=messages,
                    temperature=TEMPERATURE,
                    api_key=user_api_key,
                )
                summary = response['choices'][0]['message']['content']
            else:
                summary = "I would be a meeting note :D"

            # Update the progress bar. total_chunks is non-zero here because
            # this loop body only runs when at least one chunk exists.
            number_of_summarized_chunks += 1
            progress_bar.progress(
                number_of_summarized_chunks / total_chunks, progress_text()
            )
            summaries.append(summary)
        five_minute_summaries.append(summaries)
    return five_minute_summaries


def format_notes(big_summaries, header):
    """Create a string containing the meeting notes in Markdown format.

    Args:
        big_summaries: List of lists of summary strings, one inner list per
            5-minute section (as returned by ``summarize_chunks``).
        header: List of header lines as returned by ``load_transcript``.

    Returns:
        The Markdown document: title, date, attendees, then one ``###``
        timestamp heading per section followed by a bullet per summary.
    """
    # The header of Google Meet transcripts always has the same structure, so
    # the fields are extracted by position. The first line contains both the
    # title and the date, with the date in parentheses.
    title_line = header[0] if header else ""
    first_line = re.split(r"[()]", title_line)
    meeting_name = first_line[0]
    # Fall back to empty fields instead of raising IndexError when the header
    # is shorter than expected or the title carries no parenthesised date.
    meeting_date = first_line[1] if len(first_line) > 1 else ""
    attendees = header[2] if len(header) > 2 else ""

    parts = [
        f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"
    ]
    for i, summaries in enumerate(big_summaries):
        # Section i starts i * 5 minutes into the meeting.
        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
        parts.append(f"### {timestamp}\n")
        parts.extend(f"- {summary.strip()}\n" for summary in summaries)
    return "".join(parts)