Spaces:

ericbotti
/

transcript-notetaker

Runtime error

transcript-notetaker / summarizer.py

Eric Botti

updated to use gpt-3.5-turbo, improved streamlit interface

9b23edc almost 2 years ago

4.82 kB

	# built in
	from io import StringIO
	import re
	import time
	# 3rd party - located in requirements.txt
	import streamlit as st
	from langchain.text_splitter import RecursiveCharacterTextSplitter
	import openai

	# Module-level settings shared by the functions below.
	HEADER_SIZE = 5 # number of leading transcript lines treated as the header and kept out of the summarized text
	CHUNK_SIZE = 2000 # approximate length in characters for each chunk being summarized (passed to the text splitter)
	TEMPERATURE = 0 # temperature value sent with every chat completion request


	def load_transcript(input_file, header_size=None):
	    """Load the text from the transcript uploaded using the file uploader widget.

	    Google Meet transcripts begin with a header (meeting title, date,
	    attendees). It is returned separately so it is not passed to the
	    summarizer along with the conversation.

	    Args:
	        input_file: Uploaded file object whose ``getvalue()`` returns the raw
	            file contents as bytes.
	        header_size: Number of leading lines to treat as the header. When
	            ``None`` (the default) the module-level ``HEADER_SIZE`` is used.

	    Returns:
	        A ``(header, transcript)`` tuple. ``header`` is a list of the first
	        ``header_size`` lines with their line endings kept; ``transcript`` is
	        the remaining lines joined into a single string. A file shorter than
	        the header yields a shorter ``header`` and an empty ``transcript``.

	    Raises:
	        UnicodeDecodeError: If the uploaded bytes are not valid UTF-8.
	    """
	    if header_size is None:
	        header_size = HEADER_SIZE

	    # 'utf-8-sig' decodes exactly like UTF-8 but drops a leading byte-order
	    # mark; with plain UTF-8 the BOM would stay glued to the title line.
	    decoded_text = input_file.getvalue().decode('utf-8-sig')

	    # StringIO.readlines() keeps each line's terminator, so joining the
	    # remaining lines reproduces the transcript text unchanged.
	    file_text = StringIO(decoded_text).readlines()

	    header = file_text[:header_size]
	    transcript = "".join(file_text[header_size:])

	    return header, transcript


	def chunk_transcript(transcript: str):
	    """Split a transcript into timestamp-delimited sections of summarizable pieces.

	    Returns a list of lists: each outer entry is the text found between two
	    timestamp markers, and each inner list holds that text subdivided by the
	    text splitter into pieces of roughly ``CHUNK_SIZE`` characters.
	    """
	    # Google Meet transcripts print a timestamp every 5 minutes; any
	    # "HH:MM:00" stamp is treated as a section boundary and removed.
	    boundary = re.compile(r"[0-9]{2}:[0-9]{2}:0{2}")
	    sections = boundary.split(transcript)

	    # One splitter instance is reused for every section.
	    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)

	    chunks = []
	    for section in sections:
	        # each section becomes its own list of appropriately sized sub-chunks
	        chunks.append(splitter.split_text(section))

	    return chunks


	def summarize_chunks(five_minute_chunks, user_api_key, debug = False):
	    """Summarize every sub-chunk of the transcript, keeping the nested layout.

	    Args:
	        five_minute_chunks: List of lists of text chunks (outer list: sections,
	            inner lists: the chunks of each section).
	        user_api_key: API key forwarded with each chat completion request.
	        debug: When truthy, no request is made and a fixed placeholder string
	            is used as every summary.

	    Returns:
	        A list of lists of summary strings with the same nesting as the input.

	    A Streamlit progress bar is created and advanced after each chunk.
	    """

	    system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
	1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
	2, Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
	3. Rely strictly on the provided text, without including external information.
	4. Format the summary in paragraph form for easy understanding.
	5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
	'''

	    # total number of sub-chunks across all sections, used for progress reporting
	    total_chunks = sum(len(section) for section in five_minute_chunks)
	    done = 0

	    progress_bar = st.progress(done, f"Summarized {done}/{total_chunks} Chunks...")

	    five_minute_summaries = []
	    for section in five_minute_chunks:
	        section_summaries = []
	        for chunk in section:
	            if debug:
	                summary = "I would be a meeting note :D"
	            else:
	                response = openai.ChatCompletion.create(
	                    model="gpt-3.5-turbo",
	                    messages=[
	                        {"role": "system", "content": system_prompt},
	                        {"role": "user", "content": chunk},
	                    ],
	                    temperature=TEMPERATURE,
	                    api_key=user_api_key,
	                )
	                summary = response['choices'][0]['message']['content']

	            # advance the progress bar before recording the summary
	            done += 1
	            progress_bar.progress(
	                done / total_chunks,
	                f"Summarized {done}/{total_chunks} Chunks...",
	            )

	            section_summaries.append(summary)

	        five_minute_summaries.append(section_summaries)

	    return five_minute_summaries


	def format_notes(big_summaries, header):
	    """Create a string containing the meeting notes in Markdown format.

	    Args:
	        big_summaries: List of lists of summary strings; the i-th inner list
	            is rendered under a heading for the timestamp ``i * 5`` minutes.
	        header: List of header lines from the transcript. The first line is
	            expected to look like ``"<title> (<date>) ..."`` and the third
	            line to hold the attendees.

	    Returns:
	        The Markdown text: a title, the date, an attendees section and one
	        ``###`` timestamp section of bullet points per entry of
	        ``big_summaries``.

	    Header fields that are missing (fewer than three header lines, or no
	    parentheses on the first line) are rendered as empty strings instead of
	    raising ``IndexError``.
	    """
	    # The header of Google Meet transcripts always has the same structure,
	    # so the fields are picked out by position.
	    title_line = header[0] if header else ""
	    title_parts = re.split(r"[()]", title_line)  # "<title> (<date>) ..." -> [title, date, rest]
	    meeting_name = title_parts[0]
	    meeting_date = title_parts[1] if len(title_parts) > 1 else ""
	    attendees = header[2] if len(header) > 2 else ""

	    # Collect the pieces and join once instead of repeatedly extending a string.
	    pieces = [f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"]

	    for i, summaries in enumerate(big_summaries):
	        # section i starts 5 * i minutes into the meeting
	        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))

	        pieces.append(f"### {timestamp}\n")
	        pieces.extend(f"- {summary.strip()}\n" for summary in summaries)

	    return "".join(pieces)