oceansweep committed
Commit 7b9da4a
1 Parent(s): 0b53f31

Create app.py

Files changed (1): app.py +1436 -0
app.py ADDED
#!/usr/bin/env python3
import gradio as gr
import argparse, configparser, datetime, json, logging, os, platform, requests, shutil, subprocess, sys, time, unicodedata
import zipfile
import contextlib
import ffmpeg
import torch
import yt_dlp
# NOTE: "from datetime import datetime" was removed here; it shadowed the
# datetime module that speaker_diarize() relies on for datetime.timedelta.

#######
# Function Sections
#
# System Checks
# Processing Paths and local file handling
# Video Download/Handling
# Audio Transcription
# Diarization
# Summarizers
# Main
#
#######

# To Do
#   Offline diarization - https://github.com/pyannote/pyannote-audio/blob/develop/tutorials/community/offline_usage_speaker_diarization.ipynb


####
#
#       TL/DW: Too Long Didn't Watch
#
# Project originally created by https://github.com/the-crypt-keeper
# Modifications made by https://github.com/rmusser01
# All credit to the original authors, I've just glued shit together.
#
#
# Usage:
#   Transcribe a single URL:
#       python diarize.py https://example.com/video.mp4
#
#   Transcribe a single URL and have the resulting transcription summarized:
#       python diarize.py https://example.com/video.mp4 --api_name openai --api_key <your_openai_api_key>
#
#   Transcribe a list of files:
#       python diarize.py ./path/to/your/text_file.txt
#
#   Transcribe a local file:
#       python diarize.py /path/to/your/localfile.mp4
#
#   Transcribe a local file and have it summarized:
#       python diarize.py ./input.mp4 --api_name openai --api_key <your_openai_api_key>
#
#   Transcribe a list of files and have them all summarized:
#       python diarize.py path_to_your_text_file.txt --api_name <openai> --api_key <your_openai_api_key>
#
###


#######################
# Config loading
#

# Read configuration from file
config = configparser.ConfigParser()
config.read('config.txt')

# API Keys
anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
groq_api_key = config.get('API', 'groq_api_key', fallback=None)
openai_api_key = config.get('API', 'openai_api_key', fallback=None)
huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)

# Models
anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
groq_model = config.get('API', 'groq_model', fallback='FIXME')
openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
huggingface_model = config.get('API', 'huggingface_model', fallback='microsoft/Phi-3-mini-128k-instruct')

# Local-Models
kobold_api_IP = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')
llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')
ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

# Retrieve output paths from the configuration file
output_path = config.get('Paths', 'output_path', fallback='results')

# Retrieve processing choice from the configuration file
processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')

# Log file
#logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)

#
#
#######################
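
# A config.txt matching the config.get() calls above would look roughly like
# the sketch below. Section and key names are taken straight from those calls;
# the values shown are illustrative placeholders, not defaults shipped with
# the project:
#
#   [API]
#   anthropic_api_key = sk-ant-...
#   openai_api_key = sk-...
#   anthropic_model = claude-3-sonnet-20240229
#
#   [Local-API]
#   kobold_api_IP = http://127.0.0.1:5000/api/v1/generate
#
#   [Paths]
#   output_path = results
#
#   [Processing]
#   processing_choice = cpu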

# Dirty hack - sue me.
os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

whisper_models = ["small", "medium", "small.en", "medium.en"]
source_languages = {
    "en": "English",
    "zh": "Chinese",
    "de": "German",
    "es": "Spanish",
    "ru": "Russian",
    "ko": "Korean",
    "fr": "French"
}
source_language_list = list(source_languages.keys())


print(r"""_____ _ ________ _ _
|_ _|| | / /| _ \| | | | _
| | | | / / | | | || | | |(_)
| | | | / / | | | || |/\| |
| | | |____ / / | |/ / \ /\ / _
\_/ \_____//_/ |___/ \/ \/ (_)


_ _
| | | |
| |_ ___ ___ | | ___ _ __ __ _
| __| / _ \ / _ \ | | / _ \ | '_ \ / _` |
| |_ | (_) || (_) | | || (_) || | | || (_| | _
\__| \___/ \___/ |_| \___/ |_| |_| \__, |( )
__/ ||/
|___/
_ _ _ _ _ _ _
| |(_) | | ( )| | | | | |
__| | _ __| | _ __ |/ | |_ __ __ __ _ | |_ ___ | |__
/ _` || | / _` || '_ \ | __| \ \ /\ / / / _` || __| / __|| '_ \
| (_| || || (_| || | | | | |_ \ V V / | (_| || |_ | (__ | | | |
\__,_||_| \__,_||_| |_| \__| \_/\_/ \__,_| \__| \___||_| |_|
""")

####################################################################################################################################
# System Checks
#
#

# Perform Platform Check
userOS = ""
def platform_check():
    global userOS
    if platform.system() == "Linux":
        print("Linux OS detected \n Running Linux appropriate commands")
        userOS = "Linux"
    elif platform.system() == "Windows":
        print("Windows OS detected \n Running Windows appropriate commands")
        userOS = "Windows"
    else:
        print("Other OS detected \n Maybe try running things manually?")
        exit()


# Check for NVIDIA GPU and CUDA availability
def cuda_check():
    global processing_choice
    try:
        nvidia_smi = subprocess.check_output("nvidia-smi", shell=True).decode()
        if "NVIDIA-SMI" in nvidia_smi:
            print("NVIDIA GPU with CUDA is available.")
            processing_choice = "cuda"  # Set processing_choice to cuda if an NVIDIA GPU with CUDA is available
        else:
            print("NVIDIA GPU with CUDA is not available.\nYou either have an AMD GPU, or you're stuck with CPU only.")
            processing_choice = "cpu"  # Set processing_choice to cpu if an NVIDIA GPU with CUDA is not available
    except subprocess.CalledProcessError:
        print("NVIDIA GPU with CUDA is not available.\nYou either have an AMD GPU, or you're stuck with CPU only.")
        processing_choice = "cpu"  # Set processing_choice to cpu if the nvidia-smi command fails


# Ask the user whether they would like to use their GPU or their CPU for transcription
def decide_cpugpu():
    global processing_choice
    processing_input = input("Would you like to use your GPU or CPU for transcription? (1/cuda)GPU/(2/cpu)CPU): ")
    if processing_choice == "cuda" and (processing_input.lower() == "cuda" or processing_input == "1"):
        print("You've chosen to use the GPU.")
        logging.debug("GPU is being used for processing")
        processing_choice = "cuda"
    elif processing_input.lower() == "cpu" or processing_input == "2":
        print("You've chosen to use the CPU.")
        logging.debug("CPU is being used for processing")
        processing_choice = "cpu"
    else:
        print("Invalid choice. Please select either GPU or CPU.")


# check for existence of ffmpeg
def check_ffmpeg():
    if shutil.which("ffmpeg") or (os.path.exists("Bin") and os.path.isfile(".\\Bin\\ffmpeg.exe")):
        logging.debug("ffmpeg found installed on the local system, in the local PATH, or in the './Bin' folder")
        pass
    else:
        logging.debug("ffmpeg not installed on the local system/in local PATH")
        print("ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
        if userOS == "Windows":
            download_ffmpeg()
        elif userOS == "Linux":
            print("You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg', 'dnf install ffmpeg', 'pacman', etc.")
        else:
            logging.debug("running an unsupported OS")
            print("You're running an unsupported/untested OS")
            exit_script = input("Let's exit the script, unless you're feeling lucky? (y/n)")
            # `exit_script == "y" or "yes" or "1"` was always truthy; compare against each option instead
            if exit_script.lower() in ("y", "yes", "1"):
                exit()


# Download ffmpeg
def download_ffmpeg():
    user_choice = input("Do you want to download ffmpeg? (y)Yes/(n)No: ")
    # `user_choice.lower() == 'yes' or 'y' or '1'` was always truthy; compare against each option instead
    if user_choice.lower() in ('yes', 'y', '1'):
        print("Downloading ffmpeg")
        url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
        response = requests.get(url)

        if response.status_code == 200:
            print("Saving ffmpeg zip file")
            logging.debug("Saving ffmpeg zip file")
            zip_path = "ffmpeg-release-essentials.zip"
            with open(zip_path, 'wb') as file:
                file.write(response.content)

            logging.debug("Extracting the 'ffmpeg.exe' file from the zip")
            print("Extracting ffmpeg.exe from zip file to '/Bin' folder")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                ffmpeg_path = "ffmpeg-7.0-essentials_build/bin/ffmpeg.exe"

                logging.debug("checking if the './Bin' folder exists, creating if not")
                bin_folder = "Bin"
                if not os.path.exists(bin_folder):
                    logging.debug("Creating a folder for './Bin', it didn't previously exist")
                    os.makedirs(bin_folder)

                logging.debug("Extracting 'ffmpeg.exe' to the './Bin' folder")
                zip_ref.extract(ffmpeg_path, path=bin_folder)

                logging.debug("Moving 'ffmpeg.exe' to the './Bin' folder")
                src_path = os.path.join(bin_folder, ffmpeg_path)
                dst_path = os.path.join(bin_folder, "ffmpeg.exe")
                shutil.move(src_path, dst_path)

            logging.debug("Removing ffmpeg zip file")
            print("Deleting zip file (we've already extracted ffmpeg.exe, no worries)")
            os.remove(zip_path)

            logging.debug("ffmpeg.exe has been downloaded and extracted to the './Bin' folder.")
            print("ffmpeg.exe has been successfully downloaded and extracted to the './Bin' folder.")
        else:
            logging.error("Failed to download the zip file.")
            print("Failed to download the zip file.")
    else:
        logging.debug("User chose to not download ffmpeg")
        print("ffmpeg will not be downloaded.")

#
#
####################################################################################################################################




####################################################################################################################################
# Processing Paths and local file handling
#
#

def read_paths_from_file(file_path):
    """ Reads a file containing URLs or local file paths and returns them as a list. """
    paths = []  # Initialize paths as an empty list
    with open(file_path, 'r') as file:
        for line in file:
            line = line.strip()
            if line and not os.path.exists(os.path.join('results', normalize_title(line.split('/')[-1].split('.')[0]) + '.json')):
                logging.debug("line successfully imported from file and added to list to be transcribed")
                paths.append(line)
    return paths
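
# For reference, the input text file is just one URL or local path per line; a
# hypothetical example (URLs/paths are illustrative only):
#
#   https://www.youtube.com/watch?v=dQw4w9WgXcQ
#   ./videos/lecture_01.mp4
#
# Lines whose normalized title already has a .json result under 'results/' are
# skipped, so re-running against the same list won't redo finished work.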


def process_path(path):
    """ Decides whether the path is a URL or a local file and processes accordingly. """
    if path.startswith('http'):
        logging.debug("file is a URL")
        return get_youtube(path)  # For YouTube URLs, modify to download and extract info
    elif os.path.exists(path):
        logging.debug("File is a path")
        return process_local_file(path)  # For local files, define a function to handle them
    else:
        logging.error(f"Path does not exist: {path}")
        return None


# FIXME
def process_local_file(file_path):
    logging.info(f"Processing local file: {file_path}")
    title = normalize_title(os.path.splitext(os.path.basename(file_path))[0])
    info_dict = {'title': title}
    logging.debug(f"Creating {title} directory...")
    download_path = create_download_directory(title)
    logging.debug(f"Converting '{title}' to an audio file (wav).")
    audio_file = convert_to_wav(file_path)  # Assumes input files are videos needing audio extraction
    logging.debug(f"'{title}' successfully converted to an audio file (wav).")
    return download_path, info_dict, audio_file
#
#
####################################################################################################################################




####################################################################################################################################
# Video Download/Handling
#

def process_url(input_path, num_speakers=2, whisper_model="small.en", offset=0, api_name=None, api_key=None, vad_filter=False, download_video_flag=False, demo_mode=False):
    if demo_mode:
        api_name = "huggingface"
        api_key = os.environ.get("HF_TOKEN")
        vad_filter = False
        download_video_flag = False

    try:
        results = main(input_path, api_name=api_name, api_key=api_key, num_speakers=num_speakers, whisper_model=whisper_model, offset=offset, vad_filter=vad_filter, download_video_flag=download_video_flag)

        if results:
            transcription_result = results[0]
            json_file_path = transcription_result['audio_file'].replace('.wav', '.segments.json')
            with open(json_file_path, 'r') as file:
                json_data = json.load(file)

            summary_file_path = json_file_path.replace('.segments.json', '_summary.txt')
            if os.path.exists(summary_file_path):
                return json_data, summary_file_path, json_file_path, summary_file_path
            else:
                return json_data, "Summary not available.", json_file_path, None
        else:
            return None, "No results found.", None, None
    except Exception as e:
        error_message = f"An error occurred: {str(e)}"
        return None, error_message, None, None
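
# process_url() always returns a 4-tuple, which maps onto the four Gradio
# outputs wired up in launch_ui(): (transcription segments, summary text or
# summary file path, path to the .segments.json file, path to the
# _summary.txt file or None when no summary exists).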


def create_download_directory(title):
    base_dir = "Results"
    # Remove characters that are illegal in Windows filenames and normalize
    safe_title = normalize_title(title)
    logging.debug(f"{title} successfully normalized")
    session_path = os.path.join(base_dir, safe_title)
    if not os.path.exists(session_path):
        os.makedirs(session_path, exist_ok=True)
        logging.debug(f"Created directory for downloaded video: {session_path}")
    else:
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
    return session_path


def normalize_title(title):
    # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
    title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
    title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?', '').replace('<', '').replace('>', '').replace('|', '')
    return title
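
# For example: normalize_title('Café: A/B "Test"?') returns 'Cafe_ A_B Test' --
# the accent is dropped by the ASCII round-trip, ':' and '/' become '_', and
# '"' and '?' are stripped outright.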


def get_youtube(video_url):
    ydl_opts = {
        'format': 'bestaudio[ext=m4a]',
        'noplaylist': False,
        'quiet': True,
        'extract_flat': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        logging.debug("About to extract youtube info")
        info_dict = ydl.extract_info(video_url, download=False)
        logging.debug("Youtube info successfully extracted")
    return info_dict


def get_playlist_videos(playlist_url):
    ydl_opts = {
        'extract_flat': True,
        'skip_download': True,
        'quiet': True
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(playlist_url, download=False)

        if 'entries' in info:
            video_urls = [entry['url'] for entry in info['entries']]
            playlist_title = info['title']
            return video_urls, playlist_title
        else:
            print("No videos found in the playlist.")
            return [], None


def save_to_file(video_urls, filename):
    with open(filename, 'w') as file:
        file.write('\n'.join(video_urls))
    print(f"Video URLs saved to {filename}")



def download_video(video_url, download_path, info_dict, download_video_flag):
    logging.debug("About to normalize downloaded video title")
    title = normalize_title(info_dict['title'])

    if not download_video_flag:
        file_path = os.path.join(download_path, f"{title}.m4a")
        ydl_opts = {
            'format': 'bestaudio[ext=m4a]',
            'outtmpl': file_path,
        }
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            logging.debug("yt_dlp: About to download audio with youtube-dl")
            ydl.download([video_url])
            logging.debug("yt_dlp: Audio successfully downloaded with youtube-dl")
        return file_path
    else:
        video_file_path = os.path.join(download_path, f"{title}_video.mp4")
        audio_file_path = os.path.join(download_path, f"{title}_audio.m4a")
        ydl_opts_video = {
            'format': 'bestvideo[ext=mp4]',
            'outtmpl': video_file_path,
        }
        ydl_opts_audio = {
            'format': 'bestaudio[ext=m4a]',
            'outtmpl': audio_file_path,
        }

        with yt_dlp.YoutubeDL(ydl_opts_video) as ydl:
            logging.debug("yt_dlp: About to download video with youtube-dl")
            ydl.download([video_url])
            logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")

        with yt_dlp.YoutubeDL(ydl_opts_audio) as ydl:
            logging.debug("yt_dlp: About to download audio with youtube-dl")
            ydl.download([video_url])
            logging.debug("yt_dlp: Audio successfully downloaded with youtube-dl")

        output_file_path = os.path.join(download_path, f"{title}.mp4")

        if userOS == "Windows":
            logging.debug("Running ffmpeg on Windows...")
            ffmpeg_command = [
                '.\\Bin\\ffmpeg.exe',
                '-i', video_file_path,
                '-i', audio_file_path,
                '-c:v', 'copy',
                '-c:a', 'copy',
                output_file_path
            ]
            subprocess.run(ffmpeg_command, check=True)
        elif userOS == "Linux":
            logging.debug("Running ffmpeg on Linux...")
            ffmpeg_command = [
                'ffmpeg',
                '-i', video_file_path,
                '-i', audio_file_path,
                '-c:v', 'copy',
                '-c:a', 'copy',
                output_file_path
            ]
            subprocess.run(ffmpeg_command, check=True)
        else:
            logging.error("You shouldn't be here...")
            exit()
        os.remove(video_file_path)
        os.remove(audio_file_path)

        return output_file_path


#
#
####################################################################################################################################




####################################################################################################################################
# Audio Transcription
#
# Convert video .m4a into .wav using ffmpeg
#   ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
#   https://www.gyan.dev/ffmpeg/builds/
#

#os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
def convert_to_wav(video_file_path, offset=0):
    print("Starting conversion process of .m4a to .WAV")
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being run on Windows")

            if sys.platform.startswith('win'):
                ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
            else:
                ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

            command = [
                ffmpeg_cmd,           # Assuming the working directory is correctly set where .\Bin exists
                "-ss", "00:00:00",    # Start at the beginning of the video
                "-i", video_file_path,
                "-ar", "16000",       # Audio sample rate
                "-ac", "1",           # Number of audio channels
                "-c:a", "pcm_s16le",  # Audio codec
                out_path
            ]
            try:
                # Redirect stdin from null device to prevent ffmpeg from waiting for input
                with open(os.devnull, 'rb') as null_file:
                    result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
                if result.returncode == 0:
                    logging.info("FFmpeg executed successfully")
                    logging.debug("FFmpeg output: %s", result.stdout)
                else:
                    logging.error("Error in running FFmpeg")
                    logging.error("FFmpeg stderr: %s", result.stderr)
                    raise RuntimeError(f"FFmpeg error: {result.stderr}")
            except Exception as e:
                logging.error("Error occurred - ffmpeg doesn't like windows")
                raise RuntimeError("ffmpeg failed")  # the exit() that followed this raise was unreachable and has been removed
        elif os.name == "posix":
            os.system(f'ffmpeg -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
        else:
            raise RuntimeError("Unsupported operating system")
        logging.info("Conversion to WAV completed: %s", out_path)
    except subprocess.CalledProcessError as e:
        logging.error("Error executing FFmpeg command: %s", str(e))
        raise RuntimeError("Error converting video file to WAV")
    except Exception as e:
        logging.error("Unexpected error occurred: %s", str(e))
        raise RuntimeError("Error converting video file to WAV")
    return out_path
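
# Usage example (path is illustrative): convert_to_wav("Results/MyTalk/MyTalk.m4a")
# writes and returns "Results/MyTalk/MyTalk.wav" -- 16 kHz, mono, signed 16-bit
# PCM, which is the format speech_to_text() below feeds to faster_whisper. Note
# the `offset` parameter is accepted but not currently applied to the command.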


# Transcribe .wav into .segments.json
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False):
    logging.info('Loading faster_whisper model: %s', whisper_model)
    from faster_whisper import WhisperModel
    model = WhisperModel(whisper_model, device=processing_choice)
    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("No audio file provided")
    logging.info("Audio file path: %s", audio_file_path)

    try:
        _, file_ending = os.path.splitext(audio_file_path)
        out_file = audio_file_path.replace(file_ending, ".segments.json")
        if os.path.exists(out_file):
            logging.info("Segments file already exists: %s", out_file)
            with open(out_file) as f:
                segments = json.load(f)
            return segments

        logging.info('Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        segments_raw, info = model.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "start": segment_chunk.start,
                "end": segment_chunk.end,
                "text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
        logging.info("Transcription completed with faster_whisper")
        with open(out_file, 'w') as f:
            json.dump(segments, f, indent=2)
    except Exception as e:
        logging.error("Error transcribing audio: %s", str(e))
        raise RuntimeError("Error transcribing audio")
    return segments
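
# The resulting .segments.json is a flat list of dicts, one per Whisper
# segment, e.g. (timestamps illustrative):
#   [
#     {"start": 0.0, "end": 4.2, "text": " Welcome to the talk."},
#     {"start": 4.2, "end": 9.8, "text": " Today we'll cover..."}
#   ]
# Every summarizer below consumes this format via extract_text_from_segments().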
#
#
####################################################################################################################################




####################################################################################################################################
# Diarization
#
# TODO: https://huggingface.co/pyannote/speaker-diarization-3.1
# embedding_model = "pyannote/embedding", embedding_size=512
# embedding_model = "speechbrain/spkrec-ecapa-voxceleb", embedding_size=192
def speaker_diarize(video_file_path, segments, embedding_model="pyannote/embedding", embedding_size=512, num_speakers=0):
    """
    1. Generating speaker embeddings for each segment.
    2. Applying agglomerative clustering on the embeddings to identify the speaker for each segment.
    """
    try:
        from pyannote.audio import Audio
        from pyannote.core import Segment
        from pyannote.audio.pipelines.speaker_verification import PretrainedSpeakerEmbedding
        import numpy as np
        import pandas as pd
        from sklearn.cluster import AgglomerativeClustering
        from sklearn.metrics import silhouette_score
        import tqdm
        import wave

        embedding_model = PretrainedSpeakerEmbedding(embedding_model, device=torch.device("cuda" if torch.cuda.is_available() else "cpu"))

        _, file_ending = os.path.splitext(f'{video_file_path}')
        audio_file = video_file_path.replace(file_ending, ".wav")
        out_file = video_file_path.replace(file_ending, ".diarize.json")

        logging.debug("getting duration of audio file")
        with contextlib.closing(wave.open(audio_file, 'r')) as f:
            frames = f.getnframes()
            rate = f.getframerate()
            duration = frames / float(rate)
        logging.debug("duration of audio file obtained")
        print(f"duration of audio file: {duration}")

        def segment_embedding(segment):
            logging.debug("Creating embedding")
            audio = Audio()
            start = segment["start"]
            end = segment["end"]

            # Enforcing a minimum segment length
            if end - start < 0.3:
                padding = 0.3 - (end - start)
                start -= padding / 2
                end += padding / 2
                print('Padded segment because it was too short:', segment)

            # Whisper overshoots the end timestamp in the last segment
            end = min(duration, end)
            # clip audio and embed
            clip = Segment(start, end)
            waveform, sample_rate = audio.crop(audio_file, clip)
            return embedding_model(waveform[None])

        embeddings = np.zeros(shape=(len(segments), embedding_size))
        for i, segment in enumerate(tqdm.tqdm(segments)):
            embeddings[i] = segment_embedding(segment)
        embeddings = np.nan_to_num(embeddings)
        print(f'Embedding shape: {embeddings.shape}')

        if num_speakers == 0:
            # Find the best number of speakers
            score_num_speakers = {}

            for num_speakers in range(2, 10 + 1):
                clustering = AgglomerativeClustering(num_speakers).fit(embeddings)
                score = silhouette_score(embeddings, clustering.labels_, metric='euclidean')
                score_num_speakers[num_speakers] = score
            best_num_speaker = max(score_num_speakers, key=lambda x: score_num_speakers[x])
            print(f"The best number of speakers: {best_num_speaker} with {score_num_speakers[best_num_speaker]} score")
        else:
            best_num_speaker = num_speakers

        # Assign speaker label
        clustering = AgglomerativeClustering(best_num_speaker).fit(embeddings)
        labels = clustering.labels_
        for i in range(len(segments)):
            segments[i]["speaker"] = 'SPEAKER ' + str(labels[i] + 1)

        with open(out_file, 'w') as f:
            f.write(json.dumps(segments, indent=2))

        # Make CSV output
        def convert_time(secs):
            return datetime.timedelta(seconds=round(secs))

        objects = {
            'Start': [],
            'End': [],
            'Speaker': [],
            'Text': []
        }
        text = ''
        for (i, segment) in enumerate(segments):
            if i == 0 or segments[i - 1]["speaker"] != segment["speaker"]:
                objects['Start'].append(str(convert_time(segment["start"])))
                objects['Speaker'].append(segment["speaker"])
                if i != 0:
                    objects['End'].append(str(convert_time(segments[i - 1]["end"])))
                    objects['Text'].append(text)
                    text = ''
            text += segment["text"] + ' '
        objects['End'].append(str(convert_time(segments[-1]["end"])))  # was segments[i - 1], which dropped the final segment's end time
        objects['Text'].append(text)

        save_path = video_file_path.replace(file_ending, ".csv")
        df_results = pd.DataFrame(objects)
        df_results.to_csv(save_path)
        return df_results, save_path

    except Exception as e:
        raise RuntimeError("Error Running inference with local model", e)
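
# The CSV written above holds one row per contiguous same-speaker run, with
# columns Start / End / Speaker / Text, roughly like this (rows illustrative;
# the unnamed first column is the pandas index):
#   ,Start,End,Speaker,Text
#   0,0:00:00,0:00:12,SPEAKER 1,Welcome to the talk. Today we'll cover...
#   1,0:00:12,0:00:25,SPEAKER 2,Thanks for having me. ...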
#
#
####################################################################################################################################




####################################################################################################################################
# Summarizers
#
#

# Summarize with OpenAI ChatGPT
def extract_text_from_segments(segments):
    logging.debug(f"openai: extracting text from {segments}")
    text = ' '.join([segment['text'] for segment in segments])
    return text


def summarize_with_openai(api_key, file_path, model):
    try:
        logging.debug("openai: Loading json data for summarization")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug("openai: Extracting text from the segments")
        text = extract_text_from_segments(segments)

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("openai: Preparing data + prompt for submittal")
        prompt_text = f"{text} \n\n\n\nPlease provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points"
        data = {
            "model": model,
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional summarizer."
                },
                {
                    "role": "user",
                    "content": prompt_text
                }
            ],
            "max_tokens": 4096,  # Adjust tokens as needed
            "temperature": 0.7
        }
        logging.debug("openai: Posting request")
        response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            summary = response.json()['choices'][0]['message']['content'].strip()
            logging.debug("openai: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.debug("openai: Summarization failed")
            print("Failed to process summary:", response.text)
            return None
    except Exception as e:
        logging.debug("openai: Error in processing: %s", str(e))
        print("Error occurred while processing summary with openai:", str(e))
        return None


def summarize_with_claude(api_key, file_path, model):
    try:
        logging.debug("anthropic: Loading JSON data")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug("anthropic: Extracting text from the segments file")
        text = extract_text_from_segments(segments)

        headers = {
            'x-api-key': api_key,
            'anthropic-version': '2023-06-01',
            'Content-Type': 'application/json'
        }

        logging.debug("anthropic: Prepping data + prompt for submittal")
        user_message = {
            "role": "user",
            "content": f"{text} \n\n\n\nPlease provide a detailed, bulleted list of the points made throughout the transcribed video and any supporting arguments made for said points"
        }

        data = {
            "model": model,
            "max_tokens": 4096,  # max _possible_ tokens to return
            "messages": [user_message],
            "stop_sequences": ["\n\nHuman:"],
            "temperature": 0.7,
            "top_k": 0,
            "top_p": 1.0,
            "metadata": {
                "user_id": "example_user_id",
            },
            "stream": False,
            "system": "You are a professional summarizer."
        }

        logging.debug("anthropic: Posting request to API")
        response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data)

        # Check if the status code indicates success
        if response.status_code == 200:
            logging.debug("anthropic: Post submittal successful")
            response_data = response.json()
            try:
                summary = response_data['content'][0]['text'].strip()
                logging.debug("anthropic: Summarization successful")
                print("Summary processed successfully.")
                return summary
            except (IndexError, KeyError) as e:
                logging.debug("anthropic: Unexpected data in response")
                print("Unexpected response format from Claude API:", response.text)
                return None
        elif response.status_code == 500:  # Handle internal server error specifically
            logging.debug("anthropic: Internal server error")
            print("Internal server error from API. Retrying may be necessary.")
            return None
        else:
            logging.debug(f"anthropic: Failed to summarize, status code {response.status_code}: {response.text}")
            print(f"Failed to process summary, status code {response.status_code}: {response.text}")
            return None

    except Exception as e:
        logging.debug("anthropic: Error in processing: %s", str(e))
        print("Error occurred while processing summary with anthropic:", str(e))
        return None



# Summarize with Cohere
def summarize_with_cohere(api_key, file_path, model):
    try:
        logging.basicConfig(level=logging.DEBUG)
        logging.debug("cohere: Loading JSON data")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug(f"cohere: Extracting text from segments file")
        text = extract_text_from_segments(segments)

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
            'Authorization': f'Bearer {api_key}'
        }

        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
        data = {
            "chat_history": [
                {"role": "USER", "message": prompt_text}
            ],
            "message": "Please provide a summary.",
            "model": model,
            "connectors": [{"id": "web-search"}]
        }

        logging.debug("cohere: Submitting request to API endpoint")
        print("cohere: Submitting request to API endpoint")
        response = requests.post('https://api.cohere.ai/v1/chat', headers=headers, json=data)
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'text' in response_data:
                summary = response_data['text'].strip()
                logging.debug("cohere: Summarization successful")
                print("Summary processed successfully.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")  # fixed typo: was `resposne.text`, a NameError
            print(f"Failed to process summary, status code {response.status_code}: {response.text}")
            return f"cohere: API request failed: {response.text}"

    except Exception as e:
        logging.error("cohere: Error in processing: %s", str(e))
        return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"


# https://console.groq.com/docs/quickstart
def summarize_with_groq(api_key, file_path, model):
    try:
        logging.debug("groq: Loading JSON data")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug(f"groq: Extracting text from segments file")
        text = extract_text_from_segments(segments)

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
        data = {
            "messages": [
                {
                    "role": "user",
                    "content": prompt_text
                }
            ],
            "model": model
        }

        logging.debug("groq: Submitting request to API endpoint")
        print("groq: Submitting request to API endpoint")
        response = requests.post('https://api.groq.com/openai/v1/chat/completions', headers=headers, json=data)

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("groq: Summarization successful")
                print("Summarization successful.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"groq: API request failed with status code {response.status_code}: {response.text}")
            return f"groq: API request failed: {response.text}"

    except Exception as e:
        logging.error("groq: Error in processing: %s", str(e))
        return f"groq: Error occurred while processing summary with groq: {str(e)}"


#################################
#
# Local Summarization

def summarize_with_llama(api_url, file_path, token):
    try:
        logging.debug("llama: Loading JSON data")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug(f"llama: Extracting text from segments file")
        text = extract_text_from_segments(segments)  # Define this function to extract text properly

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }
        if len(token) > 5:
            headers['Authorization'] = f'Bearer {token}'

        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."
        data = {
            "prompt": prompt_text
        }

        logging.debug("llama: Submitting request to API endpoint")
        print("llama: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=data)
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            #if 'X' in response_data:
            logging.debug(response_data)
            summary = response_data['content'].strip()
            logging.debug("llama: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"llama: API request failed with status code {response.status_code}: {response.text}")
            return f"llama: API request failed: {response.text}"

    except Exception as e:
        logging.error("llama: Error in processing: %s", str(e))
        return f"llama: Error occurred while processing summary with llama: {str(e)}"


# https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
def summarize_with_kobold(api_url, file_path):
    try:
        logging.debug("kobold: Loading JSON data")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug(f"kobold: Extracting text from segments file")
        text = extract_text_from_segments(segments)

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }
        # FIXME
        prompt_text = f"{text} \n\nAs a professional summarizer, create a concise and comprehensive summary of the above text."
        logging.debug(prompt_text)
        # Values literally c/p from the api docs....
        data = {
            "max_context_length": 8096,
            "max_length": 4096,
            "prompt": prompt_text,
        }

        logging.debug("kobold: Submitting request to API endpoint")
        print("kobold: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=data)
        response_data = response.json()
        logging.debug("kobold: API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'results' in response_data and len(response_data['results']) > 0:
                summary = response_data['results'][0]['text'].strip()
                logging.debug("kobold: Summarization successful")
                print("Summarization successful.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"kobold: API request failed with status code {response.status_code}: {response.text}")
            return f"kobold: API request failed: {response.text}"

    except Exception as e:
        logging.error("kobold: Error in processing: %s", str(e))
        return f"kobold: Error occurred while processing summary with kobold: {str(e)}"


# https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
def summarize_with_oobabooga(api_url, file_path):
    try:
        logging.debug("ooba: Loading JSON data")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug(f"ooba: Extracting text from segments file")
        text = extract_text_from_segments(segments)
        logging.debug(f"ooba: Finished extracting text from segments file")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        # The original build sent a hard-coded placeholder paragraph about baking
        # cakes (leftover test input) and left the transcript commented out; the
        # transcript is now included so the summary covers the transcribed video.
        prompt_text = f"{text}"
        prompt_text += "\n\nAs a professional summarizer, create a concise and comprehensive summary of the provided text."

        data = {
            "mode": "chat",
            "character": "Example",
            "messages": [{"role": "user", "content": prompt_text}]
        }

        logging.debug("ooba: Submitting request to API endpoint")
        print("ooba: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=data, verify=False)
        logging.debug("ooba: API Response Data: %s", response)

        if response.status_code == 200:
            response_data = response.json()
            summary = response_data['choices'][0]['message']['content']
            logging.debug("ooba: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"oobabooga: API request failed with status code {response.status_code}: {response.text}")
            return f"ooba: API request failed with status code {response.status_code}: {response.text}"

    except Exception as e:
        logging.error("ooba: Error in processing: %s", str(e))
        return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"


def save_summary_to_file(summary, file_path):
    summary_file_path = file_path.replace('.segments.json', '_summary.txt')
    logging.debug("Opening summary file for writing; replacing *.segments.json with *_summary.txt")
    with open(summary_file_path, 'w') as file:
        file.write(summary)
    logging.info(f"Summary saved to file: {summary_file_path}")

#
#
####################################################################################################################################




####################################################################################################################################
# Gradio UI
#

# Only to be used when configured with Gradio for HF Space
def summarize_with_huggingface(api_key, file_path):
    logging.debug(f"huggingface: Summarization process starting...")
    try:
        logging.debug("huggingface: Loading json data for summarization")
        with open(file_path, 'r') as file:
            segments = json.load(file)

        logging.debug("huggingface: Extracting text from the segments")
        text = ' '.join([segment['text'] for segment in segments])

        api_key = os.environ.get('HF_TOKEN')
        headers = {
            "Authorization": f"Bearer {api_key}"
        }
        model = "microsoft/Phi-3-mini-128k-instruct"
        API_URL = f"https://api-inference.huggingface.co/models/{model}"
        data = {
            "inputs": text,
            "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
        }

        logging.debug("huggingface: Submitting request...")
        response = requests.post(API_URL, headers=headers, json=data)

        if response.status_code == 200:
            summary = response.json()[0]['summary_text']
            logging.debug("huggingface: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
            return f"Failed to process summary, status code {response.status_code}: {response.text}"
    except Exception as e:
        logging.error("huggingface: Error in processing: %s", str(e))
        print(f"Error occurred while processing summary with huggingface: {str(e)}")
        return None

def same_auth(username, password):
    return username == password
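
# same_auth isn't wired up anywhere below; it matches the (username, password)
# callback signature Gradio accepts for authentication, so presumably the
# intent was something like iface.launch(auth=same_auth), which would admit
# any login where the username equals the password.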


def launch_ui(demo_mode=False):
    def process_transcription(json_data):
        if json_data:
            return "\n".join([item["text"] for item in json_data])
        else:
            return ""

    inputs = [
        gr.components.Textbox(label="URL"),
        gr.components.Number(value=2, label="Number of Speakers"),
        gr.components.Dropdown(choices=whisper_models, value="small.en", label="Whisper Model"),
        gr.components.Number(value=0, label="Offset")
    ]

    if not demo_mode:
        inputs.extend([
            gr.components.Dropdown(choices=["huggingface", "openai", "anthropic", "cohere", "groq", "llama", "kobold", "ooba"], value="anthropic", label="API Name"),
            gr.components.Textbox(label="API Key"),
            gr.components.Checkbox(value=False, label="VAD Filter"),
            gr.components.Checkbox(value=False, label="Download Video")
        ])

    iface = gr.Interface(
        fn=lambda *args: process_url(*args, demo_mode=demo_mode),
        inputs=inputs,
        outputs=[
            gr.components.Textbox(label="Transcription", value=lambda: "", max_lines=10),
            gr.components.Textbox(label="Summary"),
            gr.components.File(label="Download Transcription as JSON"),
            gr.components.File(label="Download Summary as text", visible=lambda summary_file_path: summary_file_path is not None)
        ],
        title="Video Transcription and Summarization",
        description="Submit a video URL for transcription and summarization.",
        allow_flagging="never"
    )

    iface.launch(share=True)

#
#
#####################################################################################################################################




####################################################################################################################################
# Main()
#
def main(input_path, api_name=None, api_key=None, num_speakers=2, whisper_model="small.en", offset=0, vad_filter=False, download_video_flag=False):
    if input_path is None and args.user_interface:
        return []
    start_time = time.monotonic()
    paths = []  # Initialize paths as an empty list
    if os.path.isfile(input_path) and input_path.endswith('.txt'):
        logging.debug("MAIN: User passed in a text file, processing text file...")
        paths = read_paths_from_file(input_path)
    elif os.path.exists(input_path):
        logging.debug("MAIN: Local file path detected")
        paths = [input_path]
    elif (info_dict := get_youtube(input_path)) and 'entries' in info_dict:
        logging.debug("MAIN: YouTube playlist detected")
        print("\n\nSorry, but playlists aren't currently supported. You can run the following command to generate a text file that you can then pass into this script though! (It may not work... playlist support seems spotty)" + """\n\n\tpython Get_Playlist_URLs.py <Youtube Playlist URL>\n\n\tThen,\n\n\tpython diarize.py <playlist text file name>\n\n""")
        return
    else:
        paths = [input_path]
    results = []

    for path in paths:
        try:
            if path.startswith('http'):
                logging.debug("MAIN: URL Detected")
                info_dict = get_youtube(path)
                if info_dict:
                    logging.debug("MAIN: Creating path for video file...")
                    download_path = create_download_directory(info_dict['title'])
                    logging.debug("MAIN: Path created successfully")
                    logging.debug("MAIN: Downloading video from yt_dlp...")
                    video_path = download_video(path, download_path, info_dict, download_video_flag)
                    logging.debug("MAIN: Video downloaded successfully")
                    logging.debug("MAIN: Converting video file to WAV...")
                    audio_file = convert_to_wav(video_path, offset)
                    logging.debug("MAIN: Audio file converted successfully")
            else:
                if os.path.exists(path):
                    logging.debug("MAIN: Local file path detected")
                    download_path, info_dict, audio_file = process_local_file(path)
                else:
                    logging.error(f"File does not exist: {path}")
                    continue

            if info_dict:
                logging.debug("MAIN: Creating transcription file from WAV")
                segments = speech_to_text(audio_file, whisper_model=whisper_model, vad_filter=vad_filter)
                transcription_result = {
                    'video_path': path,
                    'audio_file': audio_file,
                    'transcription': segments
                }
                results.append(transcription_result)
                logging.info(f"Transcription complete: {audio_file}")

                # Perform summarization based on the specified API
                if api_name and api_key:
                    logging.debug(f"MAIN: Summarization being performed by {api_name}")
                    json_file_path = audio_file.replace('.wav', '.segments.json')
                    summary = None  # Initialized up front so a connection error can't leave it unbound
                    # The original handlers assigned to an undefined `r.status_code` on
                    # ConnectionError (a NameError); they now just log the failure.
                    if api_name.lower() == 'openai':
                        api_key = openai_api_key
                        try:
                            logging.debug(f"MAIN: trying to summarize with openAI")
                            summary = summarize_with_openai(api_key, json_file_path, openai_model)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with openAI")
                    elif api_name.lower() == 'anthropic':
                        api_key = anthropic_api_key
                        try:
                            logging.debug(f"MAIN: Trying to summarize with anthropic")
                            summary = summarize_with_claude(api_key, json_file_path, anthropic_model)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with anthropic")
                    elif api_name.lower() == 'cohere':
                        api_key = cohere_api_key
                        try:
                            logging.debug(f"MAIN: Trying to summarize with cohere")
                            summary = summarize_with_cohere(api_key, json_file_path, cohere_model)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with cohere")
                    elif api_name.lower() == 'groq':
                        api_key = groq_api_key
                        try:
                            logging.debug(f"MAIN: Trying to summarize with Groq")
                            summary = summarize_with_groq(api_key, json_file_path, groq_model)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with Groq")
                    elif api_name.lower() == 'llama':
                        token = llama_api_key
                        llama_ip = llama_api_IP
                        try:
                            logging.debug(f"MAIN: Trying to summarize with Llama.cpp")
                            summary = summarize_with_llama(llama_ip, json_file_path, token)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with Llama.cpp")
                    elif api_name.lower() == 'kobold':
                        token = kobold_api_key
                        kobold_ip = kobold_api_IP
                        try:
                            logging.debug(f"MAIN: Trying to summarize with kobold.cpp")
                            summary = summarize_with_kobold(kobold_ip, json_file_path)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with kobold.cpp")
                    elif api_name.lower() == 'ooba':
                        token = ooba_api_key
                        ooba_ip = ooba_api_IP
                        try:
                            logging.debug(f"MAIN: Trying to summarize with oobabooga")
                            summary = summarize_with_oobabooga(ooba_ip, json_file_path)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with oobabooga")
                    elif api_name.lower() == 'huggingface':  # was a bare `if` that fell outside the elif chain, and the result wasn't assigned to `summary`
                        api_key = huggingface_api_key
                        try:
                            logging.debug(f"MAIN: Trying to summarize with huggingface")
                            summary = summarize_with_huggingface(api_key, json_file_path)
                        except requests.exceptions.ConnectionError:
                            logging.error("MAIN: Connection error while summarizing with huggingface")
                    else:
                        logging.warning(f"Unsupported API: {api_name}")
                        summary = None

                    if summary:
                        transcription_result['summary'] = summary
                        logging.info(f"Summary generated using {api_name} API")
                        save_summary_to_file(summary, json_file_path)
                    else:
                        logging.warning(f"Failed to generate summary using {api_name} API")
                else:
                    logging.info("No API specified. Summarization will not be performed")
        except Exception as e:
            logging.error(f"Error processing path: {path}")
            logging.error(str(e))
    end_time = time.monotonic()
    #print("Total program execution time: " + str(datetime.timedelta(seconds=end_time - start_time)))

    return results


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Transcribe and summarize videos.')
    parser.add_argument('input_path', type=str, help='Path or URL of the video', nargs='?')
    parser.add_argument('-v', '--video', action='store_true', help='Download the video instead of just the audio')
    parser.add_argument('-api', '--api_name', type=str, help='API name for summarization (optional)')
    parser.add_argument('-key', '--api_key', type=str, help='API key for summarization (optional)')
    parser.add_argument('-ns', '--num_speakers', type=int, default=2, help='Number of speakers (default: 2)')
    parser.add_argument('-wm', '--whisper_model', type=str, default='small.en', help='Whisper model (default: small.en)')
    parser.add_argument('-off', '--offset', type=int, default=0, help='Offset in seconds (default: 0)')
    parser.add_argument('-vad', '--vad_filter', action='store_true', help='Enable VAD filter')
    parser.add_argument('-log', '--log_level', type=str, default='INFO', choices=['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'], help='Log level (default: INFO)')
    parser.add_argument('-ui', '--user_interface', action='store_true', help='Launch the Gradio user interface')
    parser.add_argument('-demo', '--demo_mode', action='store_true', help='Enable demo mode')
    #parser.add_argument('--log_file', type=str, help='Where to save logfile (non-default)')
    args = parser.parse_args()

    # Since this is running in HF....
    args.user_interface = True
    if args.user_interface:
        launch_ui(demo_mode=args.demo_mode)
    else:
        if not args.input_path:
            parser.print_help()
            sys.exit(1)

        logging.basicConfig(level=getattr(logging, args.log_level), format='%(asctime)s - %(levelname)s - %(message)s')

        logging.info('Starting the transcription and summarization process.')
        logging.info(f'Input path: {args.input_path}')
        logging.info(f'API Name: {args.api_name}')
        logging.debug(f'API Key: {args.api_key}')  # ehhhhh
        logging.info(f'Number of speakers: {args.num_speakers}')
        logging.info(f'Whisper model: {args.whisper_model}')
        logging.info(f'Offset: {args.offset}')
        logging.info(f'VAD filter: {args.vad_filter}')
        logging.info(f'Log Level: {args.log_level}')  # lol

        if args.api_name and args.api_key:
            logging.info(f'API: {args.api_name}')
            logging.info('Summarization will be performed.')
        else:
            logging.info('No API specified. Summarization will not be performed.')

        logging.debug("Platform check being performed...")
        platform_check()
        logging.debug("CUDA check being performed...")
        cuda_check()
        logging.debug("ffmpeg check being performed...")
        check_ffmpeg()

        try:
            results = main(args.input_path, api_name=args.api_name, api_key=args.api_key, num_speakers=args.num_speakers, whisper_model=args.whisper_model, offset=args.offset, vad_filter=args.vad_filter, download_video_flag=args.video)
            logging.info('Transcription process completed.')
        except Exception as e:
            logging.error('An error occurred during the transcription process.')
            logging.error(str(e))
            sys.exit(1)