transcript-notetaker / summarizer.py
Eric Botti
updated to use gpt-3.5-turbo, improved streamlit interface
9b23edc
# built in
from io import StringIO
import re
import time
# 3rd party - located in requirements.txt
import streamlit as st
from langchain.text_splitter import RecursiveCharacterTextSplitter
import openai
HEADER_SIZE = 5 # number of lines in the transcript header
CHUNK_SIZE = 2000 # approximate length in characters for each chunk being summarized
TEMPERATURE = 0 # sampling temperature passed to the chat completion request
def load_transcript(input_file, header_size=None):
    """Load the text from the transcript uploaded using the file uploader widget.

    Google Meet transcripts have a header with info like the meeting title,
    date, and attendees. The header is returned separately so that it is not
    passed to the summarizer together with the conversation.

    Args:
        input_file: uploaded file object; only ``getvalue()`` is called and it
            must return the file content as UTF-8 encoded bytes.
        header_size: number of leading lines that make up the header.
            Defaults to the module-level ``HEADER_SIZE``.

    Returns:
        A ``(header, transcript)`` tuple. ``header`` is the list of header
        lines (line endings kept); ``transcript`` is the rest of the file
        joined back into one string. A file shorter than the header yields a
        shorter header list and an empty transcript.
    """
    if header_size is None:
        header_size = HEADER_SIZE
    # 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order
    # mark if one is present, so it cannot end up inside the first header line
    text = input_file.getvalue().decode('utf-8-sig')
    # StringIO splits on '\n' only and keeps the line endings, so joining the
    # body lines reproduces the original text unchanged
    lines = StringIO(text).readlines()
    header = lines[:header_size]
    transcript = "".join(lines[header_size:])
    return header, transcript
def chunk_transcript(transcript: str):
    """Split a transcript into 5-minute sections, each cut into summarizable pieces.

    Returns a list of lists: the outer list has one entry per section found
    between timestamps, and each inner list holds that section's sub-chunks
    of approximately CHUNK_SIZE characters.
    """
    # Google Meet transcripts show the timestamp every 5 minutes, so those
    # timestamps serve as section boundaries (the split discards them)
    sections = re.split(r"[0-9]{2}:[0-9]{2}:0{2}", transcript)
    # a single splitter is reused to break every section into pieces that are
    # small enough to be summarized thoroughly
    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
    chunks = []
    for section in sections:
        chunks.append(splitter.split_text(section))
    return chunks
def summarize_chunks(five_minute_chunks, user_api_key, debug=False):
    """Summarize every sub-chunk of the transcript, reporting progress in the UI.

    Args:
        five_minute_chunks: list of lists of text chunks, one inner list per
            5-minute section of the meeting.
        user_api_key: OpenAI API key sent with each chat completion request.
        debug: when truthy, no request is made and a fixed placeholder string
            is used as the summary of every chunk.

    Returns:
        A list of lists of summaries with the same nesting as the input.
    """
    system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
2, Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
3. Rely strictly on the provided text, without including external information.
4. Format the summary in paragraph form for easy understanding.
5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
'''
    # count all sub-chunks up front so the progress bar can show a fraction
    total_chunks = 0
    for section in five_minute_chunks:
        total_chunks += len(section)
    done = 0
    progress_bar = st.progress(done, f"Summarized {done}/{total_chunks} Chunks...")

    five_minute_summaries = []
    for section in five_minute_chunks:
        section_summaries = []
        for chunk in section:
            if debug:
                summary = "I would be a meeting note :D"
            else:
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=[
                        {"role": "system", "content": system_prompt},
                        {"role": "user", "content": chunk},
                    ],
                    temperature=TEMPERATURE,
                    api_key=user_api_key,
                )
                summary = response['choices'][0]['message']['content']
            # advance the progress bar once this chunk has its summary
            done += 1
            progress_bar.progress(done / total_chunks, f"Summarized {done}/{total_chunks} Chunks...")
            section_summaries.append(summary)
        five_minute_summaries.append(section_summaries)
    return five_minute_summaries
def format_notes(big_summaries, header):
    """Create a string containing the meeting notes in Markdown format.

    Args:
        big_summaries: list of lists of summary strings; entry ``i`` holds the
            summaries for the section starting ``5 * i`` minutes into the
            meeting.
        header: the header lines of the transcript. The first line is
            expected to look like ``"<title> (<date>)"`` and the third line to
            list the attendees. Pieces that are missing are rendered as empty
            strings instead of raising ``IndexError``.

    Returns:
        The meeting notes as a single Markdown string.
    """
    header_lines = header or []
    # The header of Google Meet transcripts always has the same structure, so
    # the fields are picked by position; the first line holds title and date
    title_line = header_lines[0] if header_lines else ""
    title_parts = re.split(r"[()]", title_line)
    meeting_name = title_parts[0]
    # no "(...)" on the first line means there is no date to show
    meeting_date = title_parts[1] if len(title_parts) > 1 else ""
    attendees = header_lines[2] if len(header_lines) > 2 else ""

    # collect the pieces and join once instead of growing a string with +=
    parts = [f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"]
    for i, summaries in enumerate(big_summaries):
        # section i starts 5 * i minutes after the beginning of the meeting
        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
        parts.append(f"### {timestamp}\n")
        parts.extend(f"- {summary.strip()}\n" for summary in summaries)
    return "".join(parts)