oceansweep committed on
Commit
dda5666
1 Parent(s): 8846bdc

Upload Utils.py

Files changed (1)
  1. App_Function_Libraries/Utils.py +499 -468
App_Function_Libraries/Utils.py CHANGED
@@ -1,468 +1,499 @@
- # Utils.py
- #########################################
- # General Utilities Library
- # This library is used to hold random utilities used by various other libraries.
- #
- ####
- ####################
- # Function List
- #
- # 1. extract_text_from_segments(segments: List[Dict]) -> str
- # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
- # 3. verify_checksum(file_path, expected_checksum)
- # 4. create_download_directory(title)
- # 5. sanitize_filename(filename)
- # 6. normalize_title(title)
- # 7.
- #
- #
- #
- ####################
- # Import necessary libraries
- import configparser
- import hashlib
- import json
- import logging
- import os
- import re
- import time
- from datetime import timedelta
- from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
-
- import requests
- import unicodedata
- from tqdm import tqdm
-
- from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
-
-
- #######################################################################################################################
- # Function Definitions
- #
-
- def extract_text_from_segments(segments):
-     logging.debug(f"Segments received: {segments}")
-     logging.debug(f"Type of segments: {type(segments)}")
-
-     def extract_text_recursive(data):
-         if isinstance(data, dict):
-             for key, value in data.items():
-                 if key == 'Text':
-                     return value
-                 elif isinstance(value, (dict, list)):
-                     result = extract_text_recursive(value)
-                     if result:
-                         return result
-         elif isinstance(data, list):
-             return ' '.join(filter(None, [extract_text_recursive(item) for item in data]))
-         return None
-
-     text = extract_text_recursive(segments)
-
-     if text:
-         return text.strip()
-     else:
-         logging.error(f"Unable to extract text from segments: {segments}")
-         return "Error: Unable to extract transcription"
-
-
- def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
-     temp_path = dest_path + '.tmp'
-
-     for attempt in range(max_retries):
-         try:
-             # Check if a partial download exists and get its size
-             resume_header = {}
-             if os.path.exists(temp_path):
-                 resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}
-
-             response = requests.get(url, stream=True, headers=resume_header)
-             response.raise_for_status()
-
-             # Get the total file size from headers
-             total_size = int(response.headers.get('content-length', 0))
-             initial_pos = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
-
-             mode = 'ab' if 'Range' in response.headers else 'wb'
-             with open(temp_path, mode) as temp_file, tqdm(
-                 total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
-             ) as pbar:
-                 for chunk in response.iter_content(chunk_size=8192):
-                     if chunk:  # filter out keep-alive new chunks
-                         temp_file.write(chunk)
-                         pbar.update(len(chunk))
-
-             # Verify the checksum if provided
-             if expected_checksum:
-                 if not verify_checksum(temp_path, expected_checksum):
-                     os.remove(temp_path)
-                     raise ValueError("Downloaded file's checksum does not match the expected checksum")
-
-             # Move the file to the final destination
-             os.rename(temp_path, dest_path)
-             print("Download complete and verified!")
-             return dest_path
-
-         except Exception as e:
-             print(f"Attempt {attempt + 1} failed: {e}")
-             if attempt < max_retries - 1:
-                 print(f"Retrying in {delay} seconds...")
-                 time.sleep(delay)
-             else:
-                 print("Max retries reached. Download failed.")
-                 raise
-
-
- def verify_checksum(file_path, expected_checksum):
-     sha256_hash = hashlib.sha256()
-     with open(file_path, 'rb') as f:
-         for byte_block in iter(lambda: f.read(4096), b''):
-             sha256_hash.update(byte_block)
-     return sha256_hash.hexdigest() == expected_checksum
-
-
- def create_download_directory(title):
-     base_dir = "Results"
-     # Remove characters that are illegal in Windows filenames and normalize
-     safe_title = normalize_title(title)
-     logging.debug(f"{title} successfully normalized")
-     session_path = os.path.join(base_dir, safe_title)
-     if not os.path.exists(session_path):
-         os.makedirs(session_path, exist_ok=True)
-         logging.debug(f"Created directory for downloaded video: {session_path}")
-     else:
-         logging.debug(f"Directory already exists for downloaded video: {session_path}")
-     return session_path
-
-
- def sanitize_filename(filename):
-     # Remove invalid characters and replace spaces with underscores
-     sanitized = re.sub(r'[<>:"/\\|?*]', '', filename)
-     sanitized = re.sub(r'\s+', ' ', sanitized).strip()
-     return sanitized
-
-
- def normalize_title(title):
-     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
-     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
-     title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
-                                                                                                                     '').replace(
-         '<', '').replace('>', '').replace('|', '')
-     return title
-
-
- def clean_youtube_url(url):
-     parsed_url = urlparse(url)
-     query_params = parse_qs(parsed_url.query)
-     if 'list' in query_params:
-         query_params.pop('list')
-     cleaned_query = urlencode(query_params, doseq=True)
-     cleaned_url = urlunparse(parsed_url._replace(query=cleaned_query))
-     return cleaned_url
-
-
- def extract_video_info(url):
-     info_dict = get_youtube(url)
-     title = info_dict.get('title', 'Untitled')
-     return info_dict, title
-
-
- def import_data(file):
-     # Implement this function to import data from a file
-     pass
-
-
- def safe_read_file(file_path):
-     encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
-     for encoding in encodings:
-         try:
-             with open(file_path, 'r', encoding=encoding) as file:
-                 return file.read()
-         except UnicodeDecodeError:
-             continue
-         except FileNotFoundError:
-             return f"File not found: {file_path}"
-         except Exception as e:
-             return f"An error occurred: {e}"
-     return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
-
- #
- #
- #######################
- # Temp file cleanup
- #
- # Global list to keep track of downloaded files
- downloaded_files = []
-
- def cleanup_downloads():
-     """Function to clean up downloaded files when the server exits."""
-     for file_path in downloaded_files:
-         try:
-             if os.path.exists(file_path):
-                 os.remove(file_path)
-                 print(f"Cleaned up file: {file_path}")
-         except Exception as e:
-             print(f"Error cleaning up file {file_path}: {e}")
-
- #
- #
- #######################
- # Config loading
- #
-
- def load_comprehensive_config():
-     # Get the directory of the current script
-     current_dir = os.path.dirname(os.path.abspath(__file__))
-     # Go up one level to the project root directory
-     project_root = os.path.dirname(current_dir)
-     # Construct the path to the config file in the project root directory
-     config_path = os.path.join(project_root, 'config.txt')
-     # Create a ConfigParser object
-     config = configparser.ConfigParser()
-     # Read the configuration file
-     files_read = config.read(config_path)
-     if not files_read:
-         raise FileNotFoundError(f"Config file not found at {config_path}")
-     return config
-
-
- # FIXME - update to include prompt path in return statement
- def load_and_log_configs():
-     try:
-         config = load_comprehensive_config()
-         if config is None:
-             logging.error("Config is None, cannot proceed")
-             return None
-         # API Keys
-         anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
-         logging.debug(
-             f"Loaded Anthropic API Key: {anthropic_api_key[:5]}...{anthropic_api_key[-5:] if anthropic_api_key else None}")
-
-         cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
-         logging.debug(
-             f"Loaded Cohere API Key: {cohere_api_key[:5]}...{cohere_api_key[-5:] if cohere_api_key else None}")
-
-         groq_api_key = config.get('API', 'groq_api_key', fallback=None)
-         logging.debug(f"Loaded Groq API Key: {groq_api_key[:5]}...{groq_api_key[-5:] if groq_api_key else None}")
-
-         openai_api_key = config.get('API', 'openai_api_key', fallback=None)
-         logging.debug(
-             f"Loaded OpenAI API Key: {openai_api_key[:5]}...{openai_api_key[-5:] if openai_api_key else None}")
-
-         huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
-         logging.debug(
-             f"Loaded HuggingFace API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:] if huggingface_api_key else None}")
-
-         openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
-         logging.debug(
-             f"Loaded OpenRouter API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:] if openrouter_api_key else None}")
-
-         deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
-         logging.debug(
-             f"Loaded DeepSeek API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:] if deepseek_api_key else None}")
-
-         # Models
-         anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
-         cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
-         groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
-         openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
-         huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
-         openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
-         deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
-
-         logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
-         logging.debug(f"Loaded Cohere Model: {cohere_model}")
-         logging.debug(f"Loaded Groq Model: {groq_model}")
-         logging.debug(f"Loaded OpenAI Model: {openai_model}")
-         logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
-         logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
-
-         # Local-Models
-         kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
-         kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')
-
-         llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
-         llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')
-
-         ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
-         ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')
-
-         tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
-         tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
-         tabby_model = config.get('models', 'tabby_model', fallback=None)
-
-         vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
-         vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
-         vllm_model = config.get('Local-API', 'vllm_model', fallback=None)
-
-         logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
-         logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
-         logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
-         logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
-         logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")
-
-         # Retrieve output paths from the configuration file
-         output_path = config.get('Paths', 'output_path', fallback='results')
-         logging.debug(f"Output path set to: {output_path}")
-
-         # Retrieve processing choice from the configuration file
-         processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
-         logging.debug(f"Processing choice set to: {processing_choice}")
-
-         # Prompts - FIXME
-         prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')
-
-         return {
-             'api_keys': {
-                 'anthropic': anthropic_api_key,
-                 'cohere': cohere_api_key,
-                 'groq': groq_api_key,
-                 'openai': openai_api_key,
-                 'huggingface': huggingface_api_key,
-                 'openrouter': openrouter_api_key,
-                 'deepseek': deepseek_api_key,
-                 'kobold': kobold_api_key,
-                 'llama': llama_api_key,
-                 'ooba': ooba_api_key,
-                 'tabby': tabby_api_key,
-                 'vllm': vllm_api_key
-             },
-             'models': {
-                 'anthropic': anthropic_model,
-                 'cohere': cohere_model,
-                 'groq': groq_model,
-                 'openai': openai_model,
-                 'huggingface': huggingface_model,
-                 'openrouter': openrouter_model,
-                 'deepseek': deepseek_model,
-                 'vllm': vllm_model,
-                 'tabby': tabby_model
-
-             },
-             'local_api_ip': {
-                 'kobold': kobold_api_ip,
-                 'llama': llama_api_IP,
-                 'ooba': ooba_api_IP,
-                 'tabby': tabby_api_IP,
-                 'vllm': vllm_api_url,
-             },
-             'output_path': output_path,
-             'processing_choice': processing_choice
-         }
-
-     except Exception as e:
-         logging.error(f"Error loading config: {str(e)}")
-         return None
-
-
- # Log file
- # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
-
-
- def format_metadata_as_text(metadata):
-     if not metadata:
-         return "No metadata available"
-
-     formatted_text = "Video Metadata:\n"
-     for key, value in metadata.items():
-         if value is not None:
-             if isinstance(value, list):
-                 # Join list items with commas
-                 formatted_value = ", ".join(str(item) for item in value)
-             elif key == 'upload_date' and len(str(value)) == 8:
-                 # Format date as YYYY-MM-DD
-                 formatted_value = f"{value[:4]}-{value[4:6]}-{value[6:]}"
-             elif key in ['view_count', 'like_count']:
-                 # Format large numbers with commas
-                 formatted_value = f"{value:,}"
-             elif key == 'duration':
-                 # Convert seconds to HH:MM:SS format
-                 hours, remainder = divmod(value, 3600)
-                 minutes, seconds = divmod(remainder, 60)
-                 formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
-             else:
-                 formatted_value = str(value)
-
-             formatted_text += f"{key.capitalize()}: {formatted_value}\n"
-     return formatted_text.strip()
-
- # # Example usage:
- # example_metadata = {
- #     'title': 'Sample Video Title',
- #     'uploader': 'Channel Name',
- #     'upload_date': '20230615',
- #     'view_count': 1000000,
- #     'like_count': 50000,
- #     'duration': 3725,  # 1 hour, 2 minutes, 5 seconds
- #     'tags': ['tag1', 'tag2', 'tag3'],
- #     'description': 'This is a sample video description.'
- # }
- #
- # print(format_metadata_as_text(example_metadata))
-
-
- def convert_to_seconds(time_str):
-     if not time_str:
-         return 0
-
-     # If it's already a number, assume it's in seconds
-     if time_str.isdigit():
-         return int(time_str)
-
-     # Parse time string in format HH:MM:SS, MM:SS, or SS
-     time_parts = time_str.split(':')
-     if len(time_parts) == 3:
-         return int(timedelta(hours=int(time_parts[0]),
-                              minutes=int(time_parts[1]),
-                              seconds=int(time_parts[2])).total_seconds())
-     elif len(time_parts) == 2:
-         return int(timedelta(minutes=int(time_parts[0]),
-                              seconds=int(time_parts[1])).total_seconds())
-     elif len(time_parts) == 1:
-         return int(time_parts[0])
-     else:
-         raise ValueError(f"Invalid time format: {time_str}")
-
-
- def save_to_file(video_urls, filename):
-     with open(filename, 'w') as file:
-         file.write('\n'.join(video_urls))
-     print(f"Video URLs saved to {filename}")
-
-
- def save_segments_to_json(segments, file_name="transcription_segments.json"):
-     """
-     Save transcription segments to a JSON file.
-
-     Parameters:
-     segments (list): List of transcription segments
-     file_name (str): Name of the JSON file to save (default: "transcription_segments.json")
-
-     Returns:
-     str: Path to the saved JSON file
-     """
-     # Ensure the Results directory exists
-     os.makedirs("Results", exist_ok=True)
-
-     # Full path for the JSON file
-     json_file_path = os.path.join("Results", file_name)
-
-     # Save segments to JSON file
-     with open(json_file_path, 'w', encoding='utf-8') as json_file:
-         json.dump(segments, json_file, ensure_ascii=False, indent=4)
-
-     return json_file_path
-
- #
- #
- #######################################################################################################################
- #
- # Backup code
-
- #
- # End of backup code
- #######################################################################################################################
-
-
-
-
+ # Utils.py
+ #########################################
+ # General Utilities Library
+ # This library is used to hold random utilities used by various other libraries.
+ #
+ ####
+ ####################
+ # Function List
+ #
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
+ # 3. verify_checksum(file_path, expected_checksum)
+ # 4. create_download_directory(title)
+ # 5. sanitize_filename(filename)
+ # 6. normalize_title(title)
+ # 7.
+ #
+ #
+ #
+ ####################
+ # Import necessary libraries
+ import configparser
+ import hashlib
+ import json
+ import logging
+ import os
+ import re
+ import time
+ from datetime import timedelta
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
+
+ import requests
+ import unicodedata
+ from tqdm import tqdm
+
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
+
+
+ #######################################################################################################################
+ # Function Definitions
+ #
+
+ def extract_text_from_segments(segments):
+     logging.debug(f"Segments received: {segments}")
+     logging.debug(f"Type of segments: {type(segments)}")
+
+     def extract_text_recursive(data):
+         if isinstance(data, dict):
+             for key, value in data.items():
+                 if key == 'Text':
+                     return value
+                 elif isinstance(value, (dict, list)):
+                     result = extract_text_recursive(value)
+                     if result:
+                         return result
+         elif isinstance(data, list):
+             return ' '.join(filter(None, [extract_text_recursive(item) for item in data]))
+         return None
+
+     text = extract_text_recursive(segments)
+
+     if text:
+         return text.strip()
+     else:
+         logging.error(f"Unable to extract text from segments: {segments}")
+         return "Error: Unable to extract transcription"
+
+
+ def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
+     temp_path = dest_path + '.tmp'
+
+     for attempt in range(max_retries):
+         try:
+             # Check if a partial download exists and get its size
+             resume_header = {}
+             if os.path.exists(temp_path):
+                 resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}
+
+             response = requests.get(url, stream=True, headers=resume_header)
+             response.raise_for_status()
+
+             # Get the total file size from headers
+             total_size = int(response.headers.get('content-length', 0))
+             initial_pos = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
+
+             mode = 'ab' if 'Range' in response.headers else 'wb'
+             with open(temp_path, mode) as temp_file, tqdm(
+                 total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
+             ) as pbar:
+                 for chunk in response.iter_content(chunk_size=8192):
+                     if chunk:  # filter out keep-alive new chunks
+                         temp_file.write(chunk)
+                         pbar.update(len(chunk))
+
+             # Verify the checksum if provided
+             if expected_checksum:
+                 if not verify_checksum(temp_path, expected_checksum):
+                     os.remove(temp_path)
+                     raise ValueError("Downloaded file's checksum does not match the expected checksum")
+
+             # Move the file to the final destination
+             os.rename(temp_path, dest_path)
+             print("Download complete and verified!")
+             return dest_path
+
+         except Exception as e:
+             print(f"Attempt {attempt + 1} failed: {e}")
+             if attempt < max_retries - 1:
+                 print(f"Retrying in {delay} seconds...")
+                 time.sleep(delay)
+             else:
+                 print("Max retries reached. Download failed.")
+                 raise
+
+
+ def verify_checksum(file_path, expected_checksum):
+     sha256_hash = hashlib.sha256()
+     with open(file_path, 'rb') as f:
+         for byte_block in iter(lambda: f.read(4096), b''):
+             sha256_hash.update(byte_block)
+     return sha256_hash.hexdigest() == expected_checksum
+
+
+ def create_download_directory(title):
+     base_dir = "Results"
+     # Remove characters that are illegal in Windows filenames and normalize
+     safe_title = normalize_title(title)
+     logging.debug(f"{title} successfully normalized")
+     session_path = os.path.join(base_dir, safe_title)
+     if not os.path.exists(session_path):
+         os.makedirs(session_path, exist_ok=True)
+         logging.debug(f"Created directory for downloaded video: {session_path}")
+     else:
+         logging.debug(f"Directory already exists for downloaded video: {session_path}")
+     return session_path
+
+
+ def sanitize_filename(filename):
+     # Remove invalid characters and replace spaces with underscores
+     sanitized = re.sub(r'[<>:"/\\|?*]', '', filename)
+     sanitized = re.sub(r'\s+', ' ', sanitized).strip()
+     return sanitized
+
+
+ def normalize_title(title):
+     # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
+     title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
+     title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
+                                                                                                                     '').replace(
+         '<', '').replace('>', '').replace('|', '')
+     return title
+
+
+ def clean_youtube_url(url):
+     parsed_url = urlparse(url)
+     query_params = parse_qs(parsed_url.query)
+     if 'list' in query_params:
+         query_params.pop('list')
+     cleaned_query = urlencode(query_params, doseq=True)
+     cleaned_url = urlunparse(parsed_url._replace(query=cleaned_query))
+     return cleaned_url
+
+
+ def extract_video_info(url):
+     info_dict = get_youtube(url)
+     title = info_dict.get('title', 'Untitled')
+     return info_dict, title
+
+
+ def import_data(file):
+     # Implement this function to import data from a file
+     pass
+
+
+ def safe_read_file(file_path):
+     encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
+     for encoding in encodings:
+         try:
+             with open(file_path, 'r', encoding=encoding) as file:
+                 return file.read()
+         except UnicodeDecodeError:
+             continue
+         except FileNotFoundError:
+             return f"File not found: {file_path}"
+         except Exception as e:
+             return f"An error occurred: {e}"
+     return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
+
+ #
+ #
+ #######################
+ # Temp file cleanup
+ #
+ # Global list to keep track of downloaded files
+ downloaded_files = []
+
+ def cleanup_downloads():
+     """Function to clean up downloaded files when the server exits."""
+     for file_path in downloaded_files:
+         try:
+             if os.path.exists(file_path):
+                 os.remove(file_path)
+                 print(f"Cleaned up file: {file_path}")
+         except Exception as e:
+             print(f"Error cleaning up file {file_path}: {e}")
+
+ #
+ #
+ #######################
+ # Config loading
+ #
+
+ def load_comprehensive_config():
+     # Get the directory of the current script
+     current_dir = os.path.dirname(os.path.abspath(__file__))
+     # Go up one level to the project root directory
+     project_root = os.path.dirname(current_dir)
+     # Construct the path to the config file in the project root directory
+     config_path = os.path.join(project_root, 'config.txt')
+     # Create a ConfigParser object
+     config = configparser.ConfigParser()
+     # Read the configuration file
+     files_read = config.read(config_path)
+     if not files_read:
+         raise FileNotFoundError(f"Config file not found at {config_path}")
+     return config
+
+
+ # FIXME - update to include prompt path in return statement
+ def load_and_log_configs():
+     try:
+         config = load_comprehensive_config()
+         if config is None:
+             logging.error("Config is None, cannot proceed")
+             return None
+         # API Keys
+         anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
+         logging.debug(
+             f"Loaded Anthropic API Key: {anthropic_api_key[:5]}...{anthropic_api_key[-5:] if anthropic_api_key else None}")
+
+         cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
+         logging.debug(
+             f"Loaded Cohere API Key: {cohere_api_key[:5]}...{cohere_api_key[-5:] if cohere_api_key else None}")
+
+         groq_api_key = config.get('API', 'groq_api_key', fallback=None)
+         logging.debug(f"Loaded Groq API Key: {groq_api_key[:5]}...{groq_api_key[-5:] if groq_api_key else None}")
+
+         openai_api_key = config.get('API', 'openai_api_key', fallback=None)
+         logging.debug(
+             f"Loaded OpenAI API Key: {openai_api_key[:5]}...{openai_api_key[-5:] if openai_api_key else None}")
+
+         huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
+         logging.debug(
+             f"Loaded HuggingFace API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:] if huggingface_api_key else None}")
+
+         openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
+         logging.debug(
+             f"Loaded OpenRouter API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:] if openrouter_api_key else None}")
+
+         deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
+         logging.debug(
+             f"Loaded DeepSeek API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:] if deepseek_api_key else None}")
+
+         # Models
+         anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
+         cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
+         groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
+         openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
+         huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
+         openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
+         deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
+
+         logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
+         logging.debug(f"Loaded Cohere Model: {cohere_model}")
+         logging.debug(f"Loaded Groq Model: {groq_model}")
+         logging.debug(f"Loaded OpenAI Model: {openai_model}")
+         logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
+         logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
+
+         # Local-Models
+         kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
+         kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')
+
+         llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
+         llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')
+
+         ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
+         ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')
+
+         tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
+         tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
+         tabby_model = config.get('models', 'tabby_model', fallback=None)
+
+         vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
+         vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
+         vllm_model = config.get('Local-API', 'vllm_model', fallback=None)
+
+         ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
+         ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
+         ollama_model = config.get('Local-API', 'ollama_model', fallback=None)
+
+         logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
+         logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
+         logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
+         logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
+         logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")
+
+         # Retrieve output paths from the configuration file
+         output_path = config.get('Paths', 'output_path', fallback='results')
+         logging.debug(f"Output path set to: {output_path}")
+
+         # Retrieve processing choice from the configuration file
+         processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
+         logging.debug(f"Processing choice set to: {processing_choice}")
+
+         # Prompts - FIXME
+         prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')
+
+         return {
+             'api_keys': {
+                 'anthropic': anthropic_api_key,
+                 'cohere': cohere_api_key,
+                 'groq': groq_api_key,
+                 'openai': openai_api_key,
+                 'huggingface': huggingface_api_key,
+                 'openrouter': openrouter_api_key,
+                 'deepseek': deepseek_api_key,
+                 'kobold': kobold_api_key,
+                 'llama': llama_api_key,
+                 'ooba': ooba_api_key,
+                 'tabby': tabby_api_key,
+                 'vllm': vllm_api_key,
+                 'ollama': ollama_api_key
+             },
+             'models': {
+                 'anthropic': anthropic_model,
+                 'cohere': cohere_model,
+                 'groq': groq_model,
+                 'openai': openai_model,
+                 'huggingface': huggingface_model,
+                 'openrouter': openrouter_model,
+                 'deepseek': deepseek_model,
+                 'vllm': vllm_model,
+                 'tabby': tabby_model,
+                 'ollama': ollama_model
+
+             },
+             'local_api_ip': {
+                 'kobold': kobold_api_ip,
+                 'llama': llama_api_IP,
+                 'ooba': ooba_api_IP,
+                 'tabby': tabby_api_IP,
+                 'vllm': vllm_api_url,
+                 'ollama': ollama_api_url
+             },
+             'output_path': output_path,
+             'processing_choice': processing_choice
+         }
+
+     except Exception as e:
+         logging.error(f"Error loading config: {str(e)}")
+         return None
+
+
+ # Log file
+ # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
+
+
+ def format_metadata_as_text(metadata):
+     if not metadata:
+         return "No metadata available"
+
+     formatted_text = "Video Metadata:\n"
+     for key, value in metadata.items():
+         if value is not None:
+             if isinstance(value, list):
+                 # Join list items with commas
+                 formatted_value = ", ".join(str(item) for item in value)
+             elif key == 'upload_date' and len(str(value)) == 8:
+                 # Format date as YYYY-MM-DD
+                 formatted_value = f"{value[:4]}-{value[4:6]}-{value[6:]}"
+             elif key in ['view_count', 'like_count']:
+                 # Format large numbers with commas
+                 formatted_value = f"{value:,}"
+             elif key == 'duration':
+                 # Convert seconds to HH:MM:SS format
+                 hours, remainder = divmod(value, 3600)
+                 minutes, seconds = divmod(remainder, 60)
+                 formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
+             else:
+                 formatted_value = str(value)
+
+             formatted_text += f"{key.capitalize()}: {formatted_value}\n"
+     return formatted_text.strip()
+
+ # # Example usage:
+ # example_metadata = {
+ #     'title': 'Sample Video Title',
+ #     'uploader': 'Channel Name',
+ #     'upload_date': '20230615',
+ #     'view_count': 1000000,
+ #     'like_count': 50000,
+ #     'duration': 3725,  # 1 hour, 2 minutes, 5 seconds
+ #     'tags': ['tag1', 'tag2', 'tag3'],
+ #     'description': 'This is a sample video description.'
+ # }
+ #
+ # print(format_metadata_as_text(example_metadata))
+
+
+ def convert_to_seconds(time_str):
+     if not time_str:
+         return 0
+
+     # If it's already a number, assume it's in seconds
+     if time_str.isdigit():
+         return int(time_str)
+
+     # Parse time string in format HH:MM:SS, MM:SS, or SS
+     time_parts = time_str.split(':')
+     if len(time_parts) == 3:
+         return int(timedelta(hours=int(time_parts[0]),
+                              minutes=int(time_parts[1]),
+                              seconds=int(time_parts[2])).total_seconds())
+     elif len(time_parts) == 2:
+         return int(timedelta(minutes=int(time_parts[0]),
+                              seconds=int(time_parts[1])).total_seconds())
+     elif len(time_parts) == 1:
+         return int(time_parts[0])
+     else:
+         raise ValueError(f"Invalid time format: {time_str}")
+
+
+ def save_to_file(video_urls, filename):
+     with open(filename, 'w') as file:
+         file.write('\n'.join(video_urls))
+     print(f"Video URLs saved to {filename}")
+
+
+ def save_segments_to_json(segments, file_name="transcription_segments.json"):
+     """
+     Save transcription segments to a JSON file.
+
+     Parameters:
+     segments (list): List of transcription segments
+     file_name (str): Name of the JSON file to save (default: "transcription_segments.json")
+
+     Returns:
+     str: Path to the saved JSON file
+     """
+     # Ensure the Results directory exists
+     os.makedirs("Results", exist_ok=True)
+
+     # Full path for the JSON file
+     json_file_path = os.path.join("Results", file_name)
+
+     # Save segments to JSON file
+     with open(json_file_path, 'w', encoding='utf-8') as json_file:
+         json.dump(segments, json_file, ensure_ascii=False, indent=4)
+
+     return json_file_path
+
+ def generate_unique_filename(base_path, base_filename):
+     """Generate a unique filename by appending a counter if necessary."""
+     filename = base_filename
+     counter = 1
+     while os.path.exists(os.path.join(base_path, filename)):
+         name, ext = os.path.splitext(base_filename)
+         filename = f"{name}_{counter}{ext}"
+         counter += 1
+     return filename
+
+
+ def generate_unique_identifier(file_path):
+     filename = os.path.basename(file_path)
+     timestamp = int(time.time())
+
+     # Generate a hash of the file content
+     hasher = hashlib.md5()
+     with open(file_path, 'rb') as f:
+         buf = f.read()
+         hasher.update(buf)
+     content_hash = hasher.hexdigest()[:8]  # Use first 8 characters of the hash
+
+     return f"local:{timestamp}:{content_hash}:{filename}"
+
+ #
+ #
+ #######################################################################################################################
+ #
+ # Backup code
+
+ #
+ # End of backup code
+ #######################################################################################################################
+
+
+
+
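
A minimal usage sketch of the helpers introduced in this upload, not part of the committed file; the paths below are placeholders, and the sketch assumes the module is importable as App_Function_Libraries.Utils, matching the import path used elsewhere in the diff.

# Hypothetical usage of the newly added helpers (placeholder paths, for illustration only)
from App_Function_Libraries.Utils import generate_unique_filename, generate_unique_identifier

# Pick a filename that does not collide with an existing file in the Results directory.
out_name = generate_unique_filename("Results", "transcription_segments.json")

# Build a "local:<timestamp>:<md5-prefix>:<filename>" identifier for an existing local file.
media_id = generate_unique_identifier("Results/example_video.mp4")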