Spaces:
Runtime error
Runtime error
# built in | |
from io import StringIO | |
import re | |
import time | |
# 3rd party - located in requirements.txt | |
import streamlit as st | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
import openai | |
# Number of leading lines in a Google Meet transcript that form its header.
HEADER_SIZE = 5
# Approximate length, in characters, of each chunk handed to the summarizer.
CHUNK_SIZE = 2000
# Sampling temperature passed to the chat-completion request.
TEMPERATURE = 0
def load_transcript(input_file):
    """Split an uploaded transcript into its header lines and its body text.

    Google Meet transcripts start with a header (meeting title, date,
    attendees). It is returned separately so that it is not passed to the
    summarizer along with the conversation.

    Args:
        input_file: The object produced by the file uploader widget. Only its
            ``getvalue()`` method, which must return the raw file bytes, is
            used.

    Returns:
        A ``(header, transcript)`` tuple. ``header`` is a list with the first
        ``HEADER_SIZE`` lines (line endings kept); ``transcript`` is the rest
        of the file joined into a single string.
    """
    # "utf-8-sig" decodes exactly like UTF-8 but drops a leading byte-order
    # mark, so a BOM-prefixed export does not leak U+FEFF into the title line.
    text = input_file.getvalue().decode("utf-8-sig")
    lines = StringIO(text).readlines()
    header = lines[:HEADER_SIZE]
    transcript = "".join(lines[HEADER_SIZE:])
    return header, transcript
def chunk_transcript(transcript: str):
    """Break a transcript into timestamped sections, then into small chunks.

    The transcript is first cut at every ``NN:NN:00`` timestamp (Google Meet
    prints one every five minutes). Each resulting section is then subdivided
    by a ``RecursiveCharacterTextSplitter`` configured with ``CHUNK_SIZE``.

    Args:
        transcript: The transcript body, without its header.

    Returns:
        A list of lists of strings. The outer list has one entry per section
        between timestamps; each inner list holds that section's sub-chunks,
        sized so they can be summarized thoroughly.
    """
    timestamp_pattern = re.compile(r"[0-9]{2}:[0-9]{2}:0{2}")
    sections = timestamp_pattern.split(transcript)

    splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)

    nested_chunks = []
    for section in sections:
        nested_chunks.append(splitter.split_text(section))
    return nested_chunks
def summarize_chunks(five_minute_chunks, user_api_key, debug=False):
    """Summarize every chunk of the transcript, reporting progress in the UI.

    Args:
        five_minute_chunks: A list of lists of strings; each inner list holds
            the text chunks belonging to one section of the meeting.
        user_api_key: The OpenAI API key passed along with every request.
        debug: When true, no request is made and a fixed placeholder string
            is used as each summary.

    Returns:
        A list of lists of summary strings with the same nesting as
        ``five_minute_chunks``.
    """
    system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
2. Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
3. Rely strictly on the provided text, without including external information.
4. Format the summary in paragraph form for easy understanding.
5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
'''
    total_chunks = sum(len(sub_chunks) for sub_chunks in five_minute_chunks)
    number_of_summarized_chunks = 0
    progress_bar = st.progress(
        number_of_summarized_chunks,
        f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...",
    )

    five_minute_summaries = []
    for sub_chunks in five_minute_chunks:
        summaries = []
        for chunk in sub_chunks:
            if debug:
                summary = "I would be a meeting note :D"
            else:
                messages = [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": chunk},
                ]
                # NOTE(review): openai.ChatCompletion is the pre-1.0 SDK
                # interface; confirm the pinned openai version still has it.
                response = openai.ChatCompletion.create(
                    model="gpt-3.5-turbo",
                    messages=messages,
                    temperature=TEMPERATURE,
                    api_key=user_api_key,
                )
                summary = response['choices'][0]['message']['content']

            # The division is only reached when at least one chunk exists,
            # so total_chunks is never zero here.
            number_of_summarized_chunks += 1
            progress_bar.progress(
                number_of_summarized_chunks / total_chunks,
                f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...",
            )
            summaries.append(summary)
        five_minute_summaries.append(summaries)
    return five_minute_summaries
def format_notes(big_summaries, header):
    """Render the meeting notes as a Markdown string.

    Args:
        big_summaries: A list of lists of summary strings. Entry ``i`` is
            printed under a heading showing ``i * 5`` minutes as ``HH:MM:SS``.
        header: The transcript header lines. The first line is expected to
            look like ``"Title (date)"`` and the third line to list the
            attendees. A missing date or attendee line yields an empty field
            instead of raising ``IndexError``.

    Returns:
        The Markdown document: title, date, attendees, then one bulleted
        section per entry of ``big_summaries``.
    """
    # The first header line carries both the title and, in parentheses, the date.
    title_line = header[0] if header else ""
    title_parts = re.split(r"[()]", title_line)
    # Without parentheses the whole line (newline included) is the title;
    # drop the line ending so the heading stays on one line.
    meeting_name = title_parts[0].rstrip("\r\n")
    meeting_date = title_parts[1] if len(title_parts) > 1 else ""
    attendees = header[2] if len(header) > 2 else ""

    pieces = [
        f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"
    ]
    for i, summaries in enumerate(big_summaries):
        # Section i starts i * 5 minutes into the meeting.
        timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
        pieces.append(f"### {timestamp}\n")
        pieces.extend(f"- {summary.strip()}\n" for summary in summaries)
    return "".join(pieces)