msaelices commited on
Commit
8fe64f7
β€’
1 Parent(s): e59c4fa

Initial commit

Browse files
Files changed (6) hide show
  1. LICENSE +21 -0
  2. README.md +2 -13
  3. api.py +19 -0
  4. engines.py +90 -0
  5. main.py +82 -0
  6. requirements.txt +8 -0
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Manuel Saelices
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,13 +1,2 @@
1
- ---
2
- title: Note Taker
3
- emoji: 🐠
4
- colorFrom: green
5
- colorTo: purple
6
- sdk: streamlit
7
- sdk_version: 1.25.0
8
- app_file: app.py
9
- pinned: false
10
- license: mit
11
- ---
12
-
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
+ # note-taker
2
+ AI-based UX for taking notes from an audio file, with speaker identification
 
 
 
 
 
 
 
 
 
 
 
api.py ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import requests
2
+ from io import BytesIO
3
+
4
+ from engines import TranscriptEngine
5
+
6
+
7
def transcribe(engine: TranscriptEngine, language: str, audio_file: BytesIO) -> str:
    """Delegate transcription of *audio_file* to the chosen backend.

    ``engine`` is any object satisfying the TranscriptEngine protocol;
    ``language`` is a language code such as ``"en"``.
    """
    result = engine.transcribe(language, audio_file)
    return result
9
+
10
+
11
def summarize_transcript(
    openai_api_key: str,
    transcript: str,
    openai_model: str = 'gpt-4',
    prompt: str = 'Summarize the following audio transcription with a list of the key points with the speakers in the original language:',
) -> str:
    """Produce a summary of *transcript* via the OpenAI chat API.

    Currently a stub: the API call is not implemented yet, so a canned
    placeholder summary is returned regardless of the arguments.
    """
    # TODO: Implement this — call OpenAI with `openai_api_key`,
    # `openai_model` and `prompt` once the integration lands.
    placeholder_summary = 'This is a summary of the transcription.'
    return placeholder_summary
engines.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Protocol
2
+ from io import BytesIO
3
+
4
+ import requests
5
+
6
+ from google.cloud import speech_v2 as speech
7
+
8
+
9
class TranscriptEngine(Protocol):
    """Structural interface every transcription backend must satisfy.

    Concrete engines (AssemblyAI, GoogleCloud in this module) are chosen
    at runtime by ``get_engine`` and duck-typed against this protocol.
    """

    def transcribe(self, language: str, audio_file: BytesIO) -> str:
        """Transcribe an audio stream to text.

        ``language`` is a language code such as ``"en"``. Both concrete
        implementations in this module accept a BytesIO stream here, so
        the annotation follows them (the original said ``bytes``).
        """
        ...
15
+
16
+
17
class AssemblyAI:
    """Transcription engine backed by the AssemblyAI REST API.

    Satisfies the TranscriptEngine protocol: upload the audio, create a
    transcription job, then poll the job until it finishes.
    """

    # API endpoints, shared by all instances.
    transcript = 'https://api.assemblyai.com/v2/transcript'
    upload = 'https://api.assemblyai.com/v2/upload'

    def __init__(self, api_key: str):
        self.api_key = api_key

    def transcribe(self, language, audio_file: BytesIO) -> str:
        """Transcribe ``audio_file`` (a binary stream) to text.

        Raises RuntimeError if AssemblyAI reports the job failed.
        """
        import time  # local import: only needed for the polling back-off

        headers = {'authorization': self.api_key, 'content-type': 'application/json'}

        # 1. Upload the raw audio; AssemblyAI answers with a temporary URL.
        upload_response = requests.post(
            AssemblyAI.upload, headers=headers, data=audio_file
        )
        audio_url = upload_response.json()['upload_url']

        # 2. Create the transcription job with diarization enabled.
        json = {
            'audio_url': audio_url,
            'iab_categories': True,
            'language_code': language,
            'speaker_labels': True,
        }
        response = requests.post(AssemblyAI.transcript, json=json, headers=headers)
        if not response.ok:
            # TODO: Handle errors properly; returning the error payload is
            # kept for backward compatibility with existing callers.
            return response.json()

        # 3. Poll until completion. The previous version busy-looped with no
        # delay and spun forever when the job ended in the 'error' state.
        polling_endpoint = f'{AssemblyAI.transcript}/{response.json()["id"]}'
        status = 'submitted'
        while status != 'completed':
            polling_response = requests.get(polling_endpoint, headers=headers)
            result = polling_response.json()  # decode once per poll
            status = result['status']
            if status == 'error':
                raise RuntimeError(
                    f'AssemblyAI transcription failed: {result.get("error")}'
                )
            if status != 'completed':
                time.sleep(1)  # avoid hammering the API while the job runs

        # TODO: Return the speakers and their text
        return result['text']
57
+
58
+
59
class GoogleCloud:
    """Transcription engine backed by Google Cloud Speech-to-Text.

    Authentication comes from Application Default Credentials (a
    credentials file); ``api_key`` is accepted only for interface parity
    with the other engines and is never used.
    """

    def __init__(self, api_key: str):
        self.api_key = api_key  # unused; Google auth uses a credentials file

    def transcribe(self, language, audio_file: BytesIO) -> str:
        """Transcribe ``audio_file`` with speaker diarization enabled."""
        # BUG FIX: the module-level import aliases ``speech_v2``, but the
        # classes used here (SpeechClient, RecognitionAudio,
        # RecognitionConfig, long_running_recognize) belong to the v1 API;
        # v2 exposes a different Recognizer-based surface. Import v1
        # locally so these calls resolve.
        from google.cloud import speech_v1 as speech

        client = speech.SpeechClient()

        audio = speech.RecognitionAudio(content=audio_file.read())

        config = speech.RecognitionConfig(
            # Let the service sniff the container/encoding from the bytes.
            encoding=speech.RecognitionConfig.AudioEncoding.ENCODING_UNSPECIFIED,
            language_code=language,
            diarization_config=speech.SpeakerDiarizationConfig(
                enable_speaker_diarization=True,
            ),
        )

        # long_running_recognize returns an Operation; block on its result.
        operation = client.long_running_recognize(config=config, audio=audio)
        response = operation.result()

        # Join the top alternative of each result chunk into one string.
        return ' '.join(
            result.alternatives[0].transcript for result in response.results
        )
82
+
83
+
84
def get_engine(engine_type: str, api_key: str | None) -> TranscriptEngine:
    """Instantiate the transcription engine named by ``engine_type``.

    ``engine_type`` must be one of ``'AssemblyAI'`` or ``'Google'``;
    anything else raises KeyError. ``api_key`` may be None for engines
    that authenticate another way.
    """
    registry = {
        'AssemblyAI': AssemblyAI,
        'Google': GoogleCloud,
    }
    engine_class = registry[engine_type]
    return engine_class(api_key)
main.py ADDED
@@ -0,0 +1,82 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import streamlit as st
3
+
4
+ from dotenv import load_dotenv
5
+ from engines import get_engine
6
+
7
+ import api
8
+
9
+ # Load environment variables from .env file before importing any other modules
10
+ load_dotenv()
11
+
12
+
13
def main():
    """Streamlit entry point: upload audio, transcribe it, summarize it."""
    st.set_page_config(
        page_title="Note Taker",
        page_icon="πŸŽ™οΈ",
        layout="centered",
        initial_sidebar_state="expanded",
    )

    title = "πŸŽ™οΈ Meetings Note Taker πŸŽ™οΈ"
    st.title(title)
    # FIX: original text read "sgenerate" and named Assembly.AI even when
    # the Google engine is selected.
    st.write(
        "Upload an audio file, transcribe it using the selected engine, and generate meeting notes using your selected model."
    )

    # Secrets/config come from the environment when set, otherwise they
    # are collected interactively from the user.
    openai_api_key = os.environ.get("OPENAI_API_KEY") or st.text_input(
        "Enter your OpenAI API key:", type="password"
    )

    engine_type = os.environ.get("TRANSCRIPTION_ENGINE") or st.selectbox(
        "Select a transcription engine:", ["AssemblyAI", "Google"]
    )
    if engine_type in ["AssemblyAI"]:
        engine_api_key = os.environ.get(
            f"{engine_type.upper()}_API_KEY"
        ) or st.text_input(f"Enter your {engine_type} API key:", type="password")
    else:
        engine_api_key = (
            None  # Google doesn't need an API key but uses a credentials file
        )
    openai_model = os.environ.get("OPENAI_MODEL") or st.selectbox(
        "Select a model:", ["gpt-3.5-turbo", "gpt-3.5-turbo-16k", "gpt-4-0613"]
    )

    uploaded_audio = st.file_uploader(
        "Upload an audio file",
        type=["aac", "m4a", "mp3", "webm", "mp4", "mpga", "wav", "mpeg"],
        accept_multiple_files=False,
    )
    language = os.environ.get("AUDIO_LANGUAGE") or st.selectbox(
        "Language code of the audio:", ["en", "es"]
    )

    if st.button("Generate Notes"):
        if uploaded_audio:
            if openai_api_key:
                st.markdown("Transcribing the audio...")
                engine = get_engine(engine_type, engine_api_key)
                transcription = api.transcribe(engine, language, uploaded_audio)

                st.markdown(
                    f"### Transcription:\n\n<details><summary>Click to view</summary><p><pre><code>{transcription}</code></pre></p></details>",
                    unsafe_allow_html=True,
                )

                st.markdown("Summarizing the transcription...")

                summary = api.summarize_transcript(
                    openai_api_key,
                    transcription,
                    openai_model,
                )

                # FIX: was an f-string with no placeholders.
                st.markdown("### Summary:")
                st.write(summary)
            else:
                st.error("We need valid OpenAI and AssemblyAI API keys")


if __name__ == "__main__":
    main()
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ requests>=2.31.0
2
+ streamlit>=1.25.0
3
+ python-dotenv>=1.0.0
4
+ google_cloud_speech>=2.21.0
5
+ torch==2.0.0+cu117
6
+ torchvision==0.15.1+cu117
7
+ torchaudio==2.0.1 --index-url https://download.pytorch.org/whl/cu117
8
+ git+https://github.com/m-bain/whisperx.git