Eric Botti committed on
Commit
9b23edc
1 Parent(s): e5d260a

updated to use gpt-3.5-turbo, improved streamlit interface

Browse files
Files changed (6) hide show
  1. .gitignore +1 -2
  2. app.py +38 -13
  3. main.py +0 -105
  4. requirements.txt +0 -0
  5. setup.py +0 -26
  6. summarizer.py +116 -0
.gitignore CHANGED
@@ -1,5 +1,4 @@
1
  venv
2
  transcript.txt
3
  notes.txt
4
- notes.md
5
- config.ini
 
1
  venv
2
  transcript.txt
3
  notes.txt
4
+ notes.md
 
app.py CHANGED
@@ -1,24 +1,49 @@
1
- # standard
2
- from io import StringIO
3
- # 3rd party
4
  import streamlit as st
5
  # local
6
- import main
7
 
8
  st.set_page_config(page_title='Transcript Notetaker', page_icon=':memo:', layout='wide')
9
 
10
- st.write("Hello World")
 
 
11
 
12
- upload = st.file_uploader("Transcript", type='.txt')
13
 
14
- take_notes = st.button("Create Notes")
 
 
15
 
16
- if take_notes and upload:
17
- upload_stringio = StringIO(upload.getvalue().decode('UTF-8'))
18
 
19
- notes = main.create_meeting_notes(upload_stringio)
20
 
21
- if notes:
22
- st.download_button("Download Notes", notes, "notes.md")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
- st.markdown(notes)
 
1
+ # 3rd party - located in requirements.txt
 
 
2
  import streamlit as st
3
  # local
4
+ import summarizer
5
 
6
  st.set_page_config(page_title='Transcript Notetaker', page_icon=':memo:', layout='wide')
7
 
8
+ # App Content
9
+ '''
10
+ # Transcript Notetaker
11
 
12
+ Upload a transcript of a Google Meet call and this app will use the OpenAI API to generate detailed notes for the meeting.
13
 
14
+ _This program was designed to work with the transcript documents automatically generated by Google Meet meetings, using
15
+ transcripts with a different format may result in unexpected behavior._
16
+ '''
17
 
18
+ api_key = st.text_input("Enter your OpenAI API key", type='password')
 
19
 
20
+ uploaded_file = st.file_uploader('Upload your Transcript', type='.txt')
21
 
22
+ if api_key and uploaded_file:
23
+ create_notes_button_disabled = False
24
+ create_notes_button_help = ''
25
+ else:
26
+ create_notes_button_disabled = True
27
+ create_notes_button_help = "Enter your API key and upload a file to continue"
28
+
29
+ button_create_notes = st.button("Create Notes", disabled=create_notes_button_disabled, help=create_notes_button_help)
30
+
31
+ meeting_notes = None
32
+
33
+ if button_create_notes:
34
+
35
+ header, transcript = summarizer.load_transcript(uploaded_file)
36
+
37
+ chunks = summarizer.chunk_transcript(transcript)
38
+
39
+ summaries = summarizer.summarize_chunks(chunks, api_key)
40
+
41
+ meeting_notes = summarizer.format_notes(summaries, header)
42
+
43
+ if meeting_notes:
44
+ st.divider()
45
+
46
+ st.download_button("Download Notes", meeting_notes, "notes.md")
47
+
48
+ st.markdown(meeting_notes)
49
 
 
main.py DELETED
@@ -1,105 +0,0 @@
1
- # standard
2
- import configparser
3
- import os
4
- import time
5
- import re
6
- # 3rd party
7
- from langchain.llms import OpenAI
8
- from langchain.chat_models import ChatOpenAI
9
- from langchain import LLMChain
10
- from langchain.text_splitter import RecursiveCharacterTextSplitter
11
- from langchain import PromptTemplate
12
-
13
- # read config
14
- config = configparser.ConfigParser()
15
- config.read('config.ini')
16
-
17
- # read config variables
18
- if not os.getenv("OPENAI_API_KEY"):
19
- os.environ["OPENAI_API_KEY"] = config['REQUIRED']['openai-api-key']
20
-
21
- # LangChain Config
22
- # llm
23
- llm = OpenAI(temperature=0)
24
- # prompt
25
- prompt = PromptTemplate(
26
- template="Write a concise summary of the following: {transcript}",
27
- input_variables=['transcript']
28
- )
29
- # chain
30
- chain = LLMChain(
31
- prompt=prompt,
32
- llm=llm,
33
- verbose=False
34
- )
35
-
36
-
37
- def load_transcript(input_file):
38
- # Google Meet Transcripts have a header which we don't want to be summarized
39
- header_lines = 5
40
-
41
- file_text = input_file.readlines()
42
-
43
- head = file_text[:header_lines]
44
- transcript = "".join(file_text[header_lines:])
45
-
46
- return head, transcript
47
-
48
-
49
- def create_meeting_notes(transcript_file):
50
- # read config variables
51
- # if not os.getenv("OPENAI_API_KEY"):
52
- # os.environ["OPENAI_API_KEY"] = config['REQUIRED']['openai-api-key']
53
- # transcript_filepath = config['OPTIONAL']['transcript-filepath']
54
- # notes_filepath = config['OPTIONAL']['notes-filepath']
55
-
56
- head, transcript = load_transcript(transcript_file)
57
-
58
- # split the transcript on the 5-min timestamps
59
- regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
60
- five_min_chunks = re.split(regex_pattern, transcript)
61
-
62
- # create a textsplitter to subdivide those chunks into appropriately sized chunks.
63
- text_splitter = RecursiveCharacterTextSplitter(chunk_size=2000, chunk_overlap=0)
64
-
65
- # list the meeting time and the chunks associated with it
66
- timestamped_summaries = []
67
-
68
- print(f"Summarizing {len(five_min_chunks)*5} minute meeting")
69
- start_time = time.time()
70
- # summarize the
71
- for i, five_minutes_chunk in enumerate(five_min_chunks):
72
- timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
73
- sub_chunks = text_splitter.split_text(five_minutes_chunk)
74
-
75
- summaries = []
76
- for j, chunk in enumerate(sub_chunks, 1):
77
- summaries.append(chain.run(chunk))
78
- print(f"{timestamp}: Chunk {j}/{len(sub_chunks)}")
79
-
80
- timestamped_summaries.append((timestamp, summaries))
81
-
82
- elapsed_time = time.time() - start_time
83
- minutes = elapsed_time // 60
84
- print(f"Summarized first {5 * (i+1)} minutes of meeting, {minutes:.0f} minutes {elapsed_time - 60 * minutes:.2f} seconds elapsed")
85
-
86
- first_line = re.split(r"[()]", head[0])
87
-
88
- # Transcript Notes
89
- meeting_notes = f'''# {first_line[0]}
90
- {first_line[1]}
91
- ## Attendees
92
- {head[2]}## Meeting Notes
93
- '''
94
- for timestamp, summaries in timestamped_summaries:
95
- meeting_notes += f'### {timestamp}\n'
96
- for summary in summaries:
97
- meeting_notes += f"- {summary.strip()}\n"
98
- meeting_notes += "\nEnd of Meeting"
99
-
100
- return meeting_notes
101
-
102
- # with open(notes_filepath, 'w+') as f:
103
- # f.write(meeting_notes)
104
-
105
- # print(f"Export to file {notes_filepath} completed")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
setup.py DELETED
@@ -1,26 +0,0 @@
1
- """
2
- Run this script first to install requirements.txt and create config file
3
- """
4
- import configparser
5
- import sys
6
- import subprocess
7
-
8
- # install requirements.txt
9
- subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-r', 'requirements.txt'])
10
-
11
- # create default config file
12
- config = configparser.ConfigParser()
13
-
14
- # Required
15
- config['REQUIRED'] = {
16
- "openai-api-key": "Replace this with your key"
17
- }
18
-
19
- # Optional
20
- config['OPTIONAL'] = {
21
- 'transcript-filepath': 'transcript.txt',
22
- 'notes-filepath': 'notes.md'
23
- }
24
-
25
- with open('config.ini', 'w') as configfile:
26
- config.write(configfile)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
summarizer.py ADDED
@@ -0,0 +1,116 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # built in
2
+ from io import StringIO
3
+ import re
4
+ import time
5
+ # 3rd party - located in requirements.txt
6
+ import streamlit as st
7
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
8
+ import openai
9
+
10
+ HEADER_SIZE = 5 # number of lines in the transcript header
11
+ CHUNK_SIZE = 2000 # approximate length in characters for each chunk being summarized
12
+ TEMPERATURE = 0
13
+
14
+
15
+ def load_transcript(input_file):
16
+ """Load the text from the transcript uploaded using the file uploader widget"""
17
+ # transform file from bytes to string
18
+ input_string = StringIO(input_file.getvalue().decode('UTF-8'))
19
+
20
+ # Google Meet Transcripts have a header with info like the meeting title, date, and attendees
21
+ # We'll want to extract this information separately, instead of having it passed to a summarizer
22
+
23
+ file_text = input_string.readlines()
24
+
25
+ header = file_text[:HEADER_SIZE]
26
+ transcript = "".join(file_text[HEADER_SIZE:])
27
+
28
+ return header, transcript
29
+
30
+
31
+ def chunk_transcript(transcript: str):
32
+ # Google Meet transcripts show the timestamp every 5 minutes
33
+ # split the transcript on the 5-min timestamps
34
+ timestamp_regex_pattern = r"[0-9]{2}:[0-9]{2}:0{2}"
35
+ five_minute_chunks = re.split(timestamp_regex_pattern, transcript)
36
+
37
+ # create a textsplitter to subdivide those chunks into appropriately sized chunks.
38
+ text_splitter = RecursiveCharacterTextSplitter(chunk_size=CHUNK_SIZE)
39
+
40
+ # for each 5 minute chunk divide further into sub-chunks of appropriate length
41
+ chunks = [text_splitter.split_text(five_minute_chunk) for five_minute_chunk in five_minute_chunks]
42
+
43
+ # chunks is a list of lists
44
+ # outer list represents 5-minute sections of the meeting
45
+ # inner lists represent the subdivisions of that section that are small enough to be summarized thoroughly
46
+
47
+ return chunks
48
+
49
+
50
+ def summarize_chunks(five_minute_chunks, user_api_key, debug = False):
51
+ """Create summaries of each chunk of the transcript"""
52
+
53
+ system_prompt = '''As a professional summarizer, create a concise and comprehensive summary of the provided conversation, while adhering to these guidelines:
54
+ 1. Craft a summary that is detailed, thorough, in-depth, and complex, while maintaining clarity and conciseness.
55
+ 2, Incorporate main ideas and essential information, eliminating extraneous language and focusing on critical aspects.
56
+ 3. Rely strictly on the provided text, without including external information.
57
+ 4. Format the summary in paragraph form for easy understanding.
58
+ 5. Do not start the response with "In this conversation", "During this conversation", "During the conversation" or a similar phrase
59
+ '''
60
+
61
+ total_chunks = sum([len(five_minute_chunk) for five_minute_chunk in five_minute_chunks])
62
+ number_of_summarized_chunks = 0
63
+
64
+ progress_bar = st.progress(number_of_summarized_chunks, f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")
65
+
66
+ five_minute_summaries = []
67
+ for sub_chunks in five_minute_chunks:
68
+ summaries = []
69
+ for chunk in sub_chunks:
70
+ if not debug:
71
+ messages = [
72
+ {"role": "system", "content": system_prompt},
73
+ {"role": "user", "content": chunk}
74
+ ]
75
+
76
+ response = openai.ChatCompletion.create(
77
+ model="gpt-3.5-turbo",
78
+ messages=messages,
79
+ temperature=TEMPERATURE,
80
+ api_key=user_api_key
81
+ )
82
+
83
+ summary = response['choices'][0]['message']['content']
84
+ else:
85
+ summary = "I would be a meeting note :D"
86
+
87
+ # update progress bar
88
+ number_of_summarized_chunks += 1
89
+ progress_bar.progress(number_of_summarized_chunks / total_chunks,
90
+ f"Summarized {number_of_summarized_chunks}/{total_chunks} Chunks...")
91
+
92
+ summaries.append(summary)
93
+
94
+ five_minute_summaries.append(summaries)
95
+
96
+ return five_minute_summaries
97
+
98
+
99
+ def format_notes(big_summaries, header):
100
+ """Create a string containing the meeting notes in Markdown format"""
101
+ # The header of a Google Meet transcript always has the same structure, so we can manually extract info from it
102
+ first_line = re.split(r"[()]", header[0]) # the first line contains both the title and the date
103
+ meeting_name = first_line[0]
104
+ meeting_date = first_line[1]
105
+ attendees = header[2]
106
+
107
+ meeting_notes = f"# {meeting_name}\n{meeting_date}\n## Attendees\n{attendees}\n## Meeting Notes\n"
108
+
109
+ for i, summaries in enumerate(big_summaries):
110
+ timestamp = time.strftime('%H:%M:%S', time.gmtime(60 * 5 * i))
111
+
112
+ meeting_notes += f"### {timestamp}\n"
113
+ for summary in summaries:
114
+ meeting_notes += f"- {summary.strip()}\n"
115
+
116
+ return meeting_notes