oceansweep committed on
Commit
c1944d4
·
verified ·
1 Parent(s): 2544492

Upload Utils.py

Browse files
Files changed (1) hide show
  1. App_Function_Libraries/Utils/Utils.py +678 -615
App_Function_Libraries/Utils/Utils.py CHANGED
@@ -1,615 +1,678 @@
1
- # Utils.py
2
- #########################################
3
- # General Utilities Library
4
- # This library is used to hold random utilities used by various other libraries.
5
- #
6
- ####
7
- ####################
8
- # Function List
9
- #
10
- # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
- # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
- # 3. verify_checksum(file_path, expected_checksum)
13
- # 4. create_download_directory(title)
14
- # 5. sanitize_filename(filename)
15
- # 6. normalize_title(title)
16
- # 7.
17
- #
18
- #
19
- #
20
- ####################
21
- # Import necessary libraries
22
- import configparser
23
- import hashlib
24
- import json
25
- import logging
26
- import os
27
- import re
28
- import time
29
- from datetime import timedelta
30
- from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
31
-
32
- import requests
33
- import unicodedata
34
- from tqdm import tqdm
35
-
36
- #######################################################################################################################
37
- # Function Definitions
38
- #
39
-
40
def extract_text_from_segments(segments):
    """Pull the transcription text out of a nested segments structure.

    Lists are flattened by joining the text found in each item with a single
    space. Dicts are scanned in insertion order: the value stored under the
    ``'Text'`` key is returned as soon as that key is seen, otherwise the first
    nested dict/list that yields non-empty text wins.

    Returns the stripped text, or the string
    ``"Error: Unable to extract transcription"`` when nothing was found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def _walk(node):
        # Lists: collect text from every entry, dropping empty results.
        if isinstance(node, list):
            pieces = [_walk(entry) for entry in node]
            return ' '.join(piece for piece in pieces if piece)
        # Anything that is neither a list nor a dict carries no text.
        if not isinstance(node, dict):
            return None
        for name, payload in node.items():
            if name == 'Text':
                return payload
            if isinstance(payload, (dict, list)):
                found = _walk(payload)
                if found:
                    return found
        return None

    extracted = _walk(segments)
    if not extracted:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return extracted.strip()
64
-
65
def import_data(file):
    """Placeholder for importing data from a file; not implemented yet.

    The ``file`` argument is currently ignored and ``None`` is returned.
    """
    return None
68
-
69
- #
70
- #
71
- #######################
72
- # Temp file cleanup
73
- #
74
- # Global list to keep track of downloaded files
75
downloaded_files = []


def cleanup_downloads():
    """Remove every path recorded in ``downloaded_files`` that still exists.

    Intended to run when the server exits. Failures are reported on stdout and
    do not stop the remaining files from being processed. The tracking list
    itself is left untouched.
    """
    for tracked_path in downloaded_files:
        try:
            if not os.path.exists(tracked_path):
                continue
            os.remove(tracked_path)
            print(f"Cleaned up file: {tracked_path}")
        except Exception as err:
            print(f"Error cleaning up file {tracked_path}: {err}")
86
-
87
- #
88
- #
89
- #######################################################################################################################
90
-
91
-
92
- #######################################################################################################################
93
- # Config loading
94
- #
95
-
96
-
97
def load_comprehensive_config():
    """Load ``config.txt`` from the project root into a ConfigParser.

    The project root is taken to be two directories above the directory that
    contains this module.

    Returns:
        configparser.ConfigParser: the parsed configuration.

    Raises:
        FileNotFoundError: if the parser could not read the config file.
    """
    module_path = os.path.abspath(__file__)
    # dirname x3: module file -> its directory -> parent -> project root.
    root_dir = os.path.dirname(os.path.dirname(os.path.dirname(module_path)))
    config_path = os.path.join(root_dir, 'config.txt')

    parser = configparser.ConfigParser()
    # read() returns the list of files it managed to parse; empty means failure.
    if not parser.read(config_path):
        raise FileNotFoundError(f"Config file not found at {config_path}")
    return parser
111
-
112
-
113
def _mask_api_key(api_key):
    """Return a log-safe preview of ``api_key``.

    Missing or empty keys yield ``None``. Keys of ten characters or fewer are
    fully masked, because showing the first and last five characters would
    reveal the whole key. Longer keys show only their first and last five
    characters.
    """
    if not api_key:
        return None
    if len(api_key) <= 10:
        return "*" * len(api_key)
    return f"{api_key[:5]}...{api_key[-5:]}"


def load_and_log_configs():
    """Load the application config and return the settings as a nested dict.

    Returns:
        dict | None: a dict with the keys ``'api_keys'``, ``'models'``,
        ``'local_api_ip'``, ``'output_path'``, ``'processing_choice'`` and
        ``'prompt_path'``; or ``None`` if the config could not be loaded or
        any error occurred (the error is logged).

    A missing API key no longer aborts the whole load: keys are masked through
    ``_mask_api_key`` before being logged instead of being sliced unguarded.
    """
    try:
        config = load_comprehensive_config()
        if config is None:
            logging.error("Config is None, cannot proceed")
            return None

        # (config prefix / result key, label in key log, label in model log, default model)
        hosted_providers = (
            ('anthropic', 'Anthropic', 'Anthropic', 'claude-3-sonnet-20240229'),
            ('cohere', 'Cohere', 'Cohere', 'command-r-plus'),
            ('groq', 'Groq', 'Groq', 'llama3-70b-8192'),
            ('openai', 'OpenAI', 'OpenAI', 'gpt-4-turbo'),
            ('huggingface', 'HuggingFace', 'HuggingFace', 'CohereForAI/c4ai-command-r-plus'),
            ('openrouter', 'OpenRouter', 'OpenRouter', 'microsoft/wizardlm-2-8x22b'),
            ('deepseek', 'DeepSeek', 'Deepseek', 'deepseek-chat'),
            ('mistral', 'Mistral', 'Mistral', 'mistral-large-latest'),
        )

        # API Keys
        api_keys = {}
        for name, key_label, _model_label, _default_model in hosted_providers:
            api_keys[name] = config.get('API', f'{name}_api_key', fallback=None)
            logging.debug(f"Loaded {key_label} API Key: {_mask_api_key(api_keys[name])}")

        # Models
        models = {}
        for name, _key_label, model_label, default_model in hosted_providers:
            models[name] = config.get('API', f'{name}_model', fallback=default_model)
            logging.debug(f"Loaded {model_label} Model: {models[name]}")

        # Local-Models
        kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')

        llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')

        ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
        ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')

        tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
        tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
        # NOTE(review): read from the 'models' section while every other local
        # model comes from 'Local-API' - confirm which section is intended.
        tabby_model = config.get('models', 'tabby_model', fallback=None)

        # NOTE(review): fallback port is 500, the other local defaults use
        # 5000/8080/11434 - possibly a typo; left unchanged.
        vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
        vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
        vllm_model = config.get('Local-API', 'vllm_model', fallback=None)

        ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
        ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
        ollama_model = config.get('Local-API', 'ollama_model', fallback=None)

        aphrodite_api_url = config.get('Local-API', 'aphrodite_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
        aphrodite_api_key = config.get('Local-API', 'aphrodite_api_key', fallback='')

        logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
        logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
        logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
        logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
        logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")

        # Retrieve output paths from the configuration file
        output_path = config.get('Paths', 'output_path', fallback='results')
        logging.debug(f"Output path set to: {output_path}")

        # Retrieve processing choice from the configuration file
        processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
        logging.debug(f"Processing choice set to: {processing_choice}")

        # Prompts
        prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')

        # Local backends are appended after the hosted providers so the
        # original key order of both dicts is preserved.
        api_keys.update({
            'kobold': kobold_api_key,
            'llama': llama_api_key,
            'ooba': ooba_api_key,
            'tabby': tabby_api_key,
            'vllm': vllm_api_key,
            'ollama': ollama_api_key,
            'aphrodite': aphrodite_api_key,
        })
        models.update({
            'vllm': vllm_model,
            'tabby': tabby_model,
            'ollama': ollama_model,
        })

        return {
            'api_keys': api_keys,
            'models': models,
            'local_api_ip': {
                'kobold': kobold_api_ip,
                'llama': llama_api_IP,
                'ooba': ooba_api_IP,
                'tabby': tabby_api_IP,
                'vllm': vllm_api_url,
                'ollama': ollama_api_url,
                'aphrodite': aphrodite_api_url
            },
            'output_path': output_path,
            'processing_choice': processing_choice,
            'prompt_path': prompt_path
        }

    except Exception as e:
        # Top-level boundary: callers rely on None to signal any failure.
        logging.error(f"Error loading config: {str(e)}")
        return None
260
-
261
- #
262
- # End of Config loading
263
- #######################################################################################################################
264
-
265
-
266
- #######################################################################################################################
267
- #
268
- # Prompt Handling Functions
269
-
270
-
271
-
272
- #
273
- # End of Prompt Handling Functions
274
- ### #############################################################################################################
275
-
276
- #######################################################################################################################
277
- #
278
- # Misc-Functions
279
-
280
- # Log file
281
- # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
282
-
283
def format_metadata_as_text(metadata):
    """Render a video metadata dict as human-readable ``Key: value`` lines.

    Args:
        metadata (dict): metadata keyed by field name. ``None`` values are
            skipped.

    Returns:
        str: ``"No metadata available"`` for an empty/missing dict, otherwise
        a ``"Video Metadata:"`` header followed by one line per field.

    Formatting rules:
        * lists are joined with ``", "``;
        * an 8-character ``upload_date`` (str or int) becomes ``YYYY-MM-DD``;
        * numeric ``view_count`` / ``like_count`` get thousands separators;
        * numeric ``duration`` (seconds) becomes ``HH:MM:SS``;
        * everything else, including non-numeric counts or durations, is
          rendered with ``str()`` instead of raising.
    """
    if not metadata:
        return "No metadata available"

    lines = ["Video Metadata:"]
    for key, value in metadata.items():
        if value is None:
            continue

        if isinstance(value, list):
            # Join list items with commas
            formatted_value = ", ".join(str(item) for item in value)
        elif key == 'upload_date' and len(str(value)) == 8:
            # Slice the string form so integer dates such as 20230615 work too.
            date_text = str(value)
            formatted_value = f"{date_text[:4]}-{date_text[4:6]}-{date_text[6:]}"
        elif key in ('view_count', 'like_count') and isinstance(value, (int, float)):
            # Format large numbers with commas
            formatted_value = f"{value:,}"
        elif key == 'duration' and isinstance(value, (int, float)):
            # Convert seconds to HH:MM:SS; int() keeps the ':02d' format valid
            # for float durations.
            hours, remainder = divmod(int(value), 3600)
            minutes, seconds = divmod(remainder, 60)
            formatted_value = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            formatted_value = str(value)

        lines.append(f"{key.capitalize()}: {formatted_value}")

    return "\n".join(lines).strip()
309
-
310
- # # Example usage:
311
- # example_metadata = {
312
- # 'title': 'Sample Video Title',
313
- # 'uploader': 'Channel Name',
314
- # 'upload_date': '20230615',
315
- # 'view_count': 1000000,
316
- # 'like_count': 50000,
317
- # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
318
- # 'tags': ['tag1', 'tag2', 'tag3'],
319
- # 'description': 'This is a sample video description.'
320
- # }
321
- #
322
- # print(format_metadata_as_text(example_metadata))
323
-
324
-
325
def convert_to_seconds(time_str):
    """Convert a time value to a whole number of seconds.

    Args:
        time_str: ``"HH:MM:SS"``, ``"MM:SS"`` or ``"SS"`` text, or a number
            that is already expressed in seconds.

    Returns:
        int: the number of seconds; ``0`` for empty/``None`` input.

    Raises:
        ValueError: if the text has more than three ``:``-separated parts or
            any part is not an integer.
    """
    if not time_str:
        return 0

    # If it's already a number, assume it's in seconds
    if isinstance(time_str, (int, float)):
        return int(time_str)

    time_str = str(time_str).strip()
    if time_str.isdigit():
        return int(time_str)

    # Parse time string in format HH:MM:SS, MM:SS, or SS
    time_parts = time_str.split(':')
    if len(time_parts) > 3:
        raise ValueError(f"Invalid time format: {time_str}")
    try:
        numbers = [int(part) for part in time_parts]
    except ValueError:
        raise ValueError(f"Invalid time format: {time_str}") from None

    # Fold left to right: each earlier unit is worth 60 of the next one.
    total_seconds = 0
    for number in numbers:
        total_seconds = total_seconds * 60 + number
    return total_seconds
346
-
347
- #
348
- # End of Misc-Functions
349
- #######################################################################################################################
350
-
351
-
352
- #######################################################################################################################
353
- #
354
- # File-saving Function Definitions
355
def save_to_file(video_urls, filename):
    """Write the given URLs to ``filename``, one per line, and report it on stdout.

    The file is overwritten if it already exists. No trailing newline is
    written after the last URL.
    """
    with open(filename, 'w') as url_file:
        url_file.write('\n'.join(video_urls))
    print(f"Video URLs saved to {filename}")
359
-
360
-
361
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """
    Dump transcription segments as pretty-printed UTF-8 JSON under ``Results/``.

    Parameters:
    segments (list): List of transcription segments
    file_name (str): Name of the JSON file to save (default: "transcription_segments.json")

    Returns:
    str: Path to the saved JSON file (relative to the current working directory)
    """
    results_dir = "Results"
    # The directory is created on demand; an existing one is left alone.
    os.makedirs(results_dir, exist_ok=True)

    destination = os.path.join(results_dir, file_name)
    with open(destination, 'w', encoding='utf-8') as sink:
        # ensure_ascii=False keeps non-ASCII transcript text readable.
        json.dump(segments, sink, ensure_ascii=False, indent=4)
    return destination
383
-
384
-
385
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5, timeout=30):
    """Download ``url`` to ``dest_path`` with retries and resume support.

    Data is streamed into ``dest_path + '.tmp'`` and moved into place only
    after the transfer (and optional checksum verification) succeeds.

    Args:
        url: address to download.
        dest_path: final path of the downloaded file.
        expected_checksum: optional SHA-256 hex digest checked via
            ``verify_checksum``.
        max_retries: maximum number of attempts.
        delay: seconds to sleep between attempts.
        timeout: timeout in seconds passed to ``requests.get`` so a stalled
            server cannot hang the download forever.

    Returns:
        ``dest_path`` on success.

    Raises:
        Exception: the error from the final attempt is re-raised once all
            retries are used (e.g. ``requests`` errors, or ``ValueError`` on
            checksum mismatch).
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Check if a partial download exists and ask the server to resume.
            existing_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
            resume_header = {'Range': f'bytes={existing_size}-'} if existing_size else {}

            with requests.get(url, stream=True, headers=resume_header, timeout=timeout) as response:
                if response.status_code == 416 and existing_size:
                    # The requested range is not satisfiable: the partial file
                    # is stale (or already complete). Drop it so the next
                    # attempt starts from scratch.
                    os.remove(temp_path)
                response.raise_for_status()

                # Only a 206 reply means the server honoured the Range header;
                # a plain 200 sends the whole file, so start over in that case.
                resumed = response.status_code == 206
                initial_pos = existing_size if resumed else 0

                # Content-Length covers only the bytes still to be sent.
                remaining = int(response.headers.get('content-length', 0))
                total_size = initial_pos + remaining if remaining else None

                mode = 'ab' if resumed else 'wb'
                with open(temp_path, mode) as temp_file, tqdm(
                    total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive new chunks
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum and not verify_checksum(temp_path, expected_checksum):
                os.remove(temp_path)
                raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # os.replace overwrites an existing destination on every platform.
            os.replace(temp_path, dest_path)
            if expected_checksum:
                print("Download complete and verified!")
            else:
                print("Download complete!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
430
-
431
def create_download_directory(title):
    """Return ``Results/<normalized title>``, creating the directory if needed.

    The title is passed through ``normalize_title`` to obtain a
    filesystem-safe directory name.
    """
    safe_title = normalize_title(title)
    logging.debug(f"{title} successfully normalized")

    session_path = os.path.join("Results", safe_title)
    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
    else:
        os.makedirs(session_path, exist_ok=True)
        logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
443
-
444
-
445
def safe_read_file(file_path):
    """Read a text file, trying several encodings in turn.

    Args:
        file_path: path of the file to read.

    Returns:
        str: the decoded content from the first encoding that works. On
        failure a human-readable message is returned instead of raising:
        ``"File not found: ..."``, ``"An error occurred: ..."`` or the
        "Unable to decode" message.
    """
    # NOTE: 'latin-1' can decode any byte sequence, so the entries after it
    # are effectively never reached; kept to preserve the reported list.
    encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeError:
            # UnicodeError rather than UnicodeDecodeError: the utf-16 decoder
            # raises the plain base class when the file has no BOM, which
            # previously aborted the search before latin-1 was tried.
            continue
        except FileNotFoundError:
            return f"File not found: {file_path}"
        except Exception as e:
            return f"An error occurred: {e}"
    return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
458
-
459
- #
460
- # End of Files-saving Function Definitions
461
- #######################################################################################################################
462
-
463
-
464
- #######################################################################################################################
465
- #
466
- # UUID-Functions
467
-
468
def generate_unique_filename(base_path, base_filename):
    """Return a filename that does not yet exist inside ``base_path``.

    ``base_filename`` is returned unchanged when it is free; otherwise
    ``_1``, ``_2``, ... is inserted before the extension until an unused
    name is found. Only the name is returned, not the full path.
    """
    stem, extension = os.path.splitext(base_filename)
    candidate = base_filename
    suffix = 1
    while os.path.exists(os.path.join(base_path, candidate)):
        candidate = f"{stem}_{suffix}{extension}"
        suffix += 1
    return candidate
477
-
478
-
479
def generate_unique_identifier(file_path):
    """Build an identifier of the form ``local:<timestamp>:<hash>:<filename>``.

    Args:
        file_path: path of an existing local file.

    Returns:
        str: ``<timestamp>`` is the current Unix time in whole seconds,
        ``<hash>`` is the first 8 hex characters of the file's MD5 digest and
        ``<filename>`` is the base name of ``file_path``.
    """
    filename = os.path.basename(file_path)
    timestamp = int(time.time())

    # Hash the content in chunks so large media files are never loaded into
    # memory in one piece. MD5 only serves as a content fingerprint here.
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            hasher.update(chunk)
    content_hash = hasher.hexdigest()[:8]  # Use first 8 characters of the hash

    return f"local:{timestamp}:{content_hash}:{filename}"
491
-
492
- #
493
- # End of UUID-Functions
494
- #######################################################################################################################
495
-
496
-
497
- #######################################################################################################################
498
- #
499
- # Backup code
500
-
501
- #
502
- # End of backup code
503
- #######################################################################################################################
504
-
505
-
506
- #######################################################################################################################
507
- #
508
- # Sanitization/Verification Functions
509
-
510
- # Helper function to validate URL format
511
def is_valid_url(url: str) -> bool:
    """Return True when ``url`` looks like an http(s)/ftp(s) URL.

    The host may be a domain name, ``localhost``, an IPv4 address or an IPv6
    address, optionally followed by a port and a path/query. Matching is
    case-insensitive.
    """
    scheme = r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    host = (
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
    )
    port = r'(?::\d+)?'  # optional port
    tail = r'(?:/?|[/?]\S+)$'  # optional path / query
    url_pattern = re.compile(scheme + host + port + tail, re.IGNORECASE)
    return url_pattern.match(url) is not None
521
-
522
-
523
def verify_checksum(file_path, expected_checksum):
    """Check a file's SHA-256 digest against an expected hex digest.

    Args:
        file_path: path of the file to hash.
        expected_checksum: expected SHA-256 hex digest. Surrounding whitespace
            and letter case are ignored, since published checksums are often
            upper-case or copied with a trailing newline.

    Returns:
        bool: True if the digests match.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Read in 4 KiB blocks so large files are not loaded into memory.
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)
    return sha256_hash.hexdigest() == str(expected_checksum).strip().lower()
529
-
530
-
531
def normalize_title(title):
    """Reduce a title to ASCII and neutralize path-hostile characters.

    The title is NFKD-normalized and non-ASCII characters are dropped. Then
    ``/``, ``\\`` and ``:`` become ``_`` while ``" * ? < > |`` are removed.
    """
    ascii_title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
    # One translation table: separators map to '_', the rest are deleted.
    replacements = {'/': '_', '\\': '_', ':': '_'}
    replacements.update(dict.fromkeys('"*?<>|'))
    return ascii_title.translate(str.maketrans(replacements))
538
-
539
-
540
def clean_youtube_url(url):
    """Return ``url`` with any ``list`` query parameter removed.

    The query string is parsed and re-encoded, so the remaining parameters
    are kept but may be re-serialized.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    params.pop('list', None)
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))
548
-
549
def sanitize_filename(filename):
    """Strip characters that are invalid in filenames and tidy whitespace.

    The characters ``< > : " / \\ | ? *`` are removed, every run of whitespace
    is collapsed to a single space, and leading/trailing whitespace is
    trimmed.
    """
    without_reserved = re.compile(r'[<>:"/\\|?*]').sub('', filename)
    return re.compile(r'\s+').sub(' ', without_reserved).strip()
554
-
555
-
556
def format_transcription(content):
    """Format raw transcription text for HTML display.

    Literal ``\\n`` escape sequences are turned into real line breaks. Within
    each line a space is inserted after every period, the line is split into
    sentences at whitespace following ``.``, ``!`` or ``?``, and the trimmed
    sentences are re-joined with single spaces. Lines are joined with
    ``<br>``.
    """
    sentence_boundary = re.compile('(?<=[.!?]) +')

    def _tidy(raw_line):
        # Guarantee a space after each period so the boundary regex can split.
        spaced = raw_line.replace('.', '. ')
        trimmed = [piece.strip() for piece in sentence_boundary.split(spaced)]
        return ' '.join(piece for piece in trimmed if piece)

    raw_lines = content.replace('\\n', '\n').split('\n')
    return '<br>'.join(_tidy(raw_line) for raw_line in raw_lines)
579
-
580
-
581
def format_file_path(file_path, fallback_path=None):
    """Return the first of ``file_path`` / ``fallback_path`` that exists.

    Returns ``None`` when neither path is set and present on disk.
    """
    if file_path and os.path.exists(file_path):
        logging.debug(f"File exists: {file_path}")
        return file_path

    if fallback_path and os.path.exists(fallback_path):
        logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
        return fallback_path

    logging.debug(f"File does not exist: {file_path}. No fallback path available.")
    return None
591
-
592
- #
593
- # End of Sanitization/Verification Functions
594
- #######################################################################################################################
595
-
596
-
597
- #######################################################################################################################
598
- #
599
- # DB Config Loading
600
-
601
-
602
def get_db_config():
    """Read the ``[Database]`` settings from ``config.txt`` in the working directory.

    Returns:
        dict: ``type`` (required), ``sqlite_path``, ``elasticsearch_host`` and
        ``elasticsearch_port`` (the last three fall back to defaults).

    Raises:
        KeyError: if the ``Database`` section or its ``type`` option is missing.
    """
    parser = configparser.ConfigParser()
    parser.read('config.txt')

    # 'type' has no fallback, so it is looked up first and may raise KeyError.
    db_settings = {'type': parser['Database']['type']}
    db_settings['sqlite_path'] = parser.get('Database', 'sqlite_path', fallback='media_summary.db')
    db_settings['elasticsearch_host'] = parser.get('Database', 'elasticsearch_host', fallback='localhost')
    db_settings['elasticsearch_port'] = parser.getint('Database', 'elasticsearch_port', fallback=9200)
    return db_settings
611
-
612
-
613
- #
614
- # End of DB Config Loading
615
- #######################################################################################################################
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utils.py
2
+ #########################################
3
+ # General Utilities Library
4
+ # This library is used to hold random utilities used by various other libraries.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
+ # 3. verify_checksum(file_path, expected_checksum)
13
+ # 4. create_download_directory(title)
14
+ # 5. sanitize_filename(filename)
15
+ # 6. normalize_title(title)
16
+ # 7.
17
+ #
18
+ #
19
+ #
20
+ ####################
21
+ # Import necessary libraries
22
+ import configparser
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ import os
27
+ import re
28
+ import time
29
+ from datetime import timedelta
30
+ from typing import Union, AnyStr
31
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
32
+
33
+ import requests
34
+ import unicodedata
35
+ from tqdm import tqdm
36
+
37
+ #######################################################################################################################
38
+ # Function Definitions
39
+ #
40
+
41
def extract_text_from_segments(segments):
    """Pull the transcription text out of a nested segments structure.

    Lists are flattened by joining the text found in each item with a single
    space. Dicts are scanned in insertion order: the value stored under the
    ``'Text'`` key is returned as soon as that key is seen, otherwise the first
    nested dict/list that yields non-empty text wins.

    Returns the stripped text, or the string
    ``"Error: Unable to extract transcription"`` when nothing was found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def _walk(node):
        # Lists: collect text from every entry, dropping empty results.
        if isinstance(node, list):
            pieces = [_walk(entry) for entry in node]
            return ' '.join(piece for piece in pieces if piece)
        # Anything that is neither a list nor a dict carries no text.
        if not isinstance(node, dict):
            return None
        for name, payload in node.items():
            if name == 'Text':
                return payload
            if isinstance(payload, (dict, list)):
                found = _walk(payload)
                if found:
                    return found
        return None

    extracted = _walk(segments)
    if not extracted:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return extracted.strip()
65
+
66
def import_data(file):
    """Placeholder for importing data from a file; not implemented yet.

    The ``file`` argument is currently ignored and ``None`` is returned.
    """
    return None
69
+
70
+ #
71
+ #
72
+ #######################
73
+ # Temp file cleanup
74
+ #
75
+ # Global list to keep track of downloaded files
76
downloaded_files = []


def cleanup_downloads():
    """Remove every path recorded in ``downloaded_files`` that still exists.

    Intended to run when the server exits. Failures are reported on stdout and
    do not stop the remaining files from being processed. The tracking list
    itself is left untouched.
    """
    for tracked_path in downloaded_files:
        try:
            if not os.path.exists(tracked_path):
                continue
            os.remove(tracked_path)
            print(f"Cleaned up file: {tracked_path}")
        except Exception as err:
            print(f"Error cleaning up file {tracked_path}: {err}")
87
+
88
+ #
89
+ #
90
+ #######################################################################################################################
91
+
92
+
93
+ #######################################################################################################################
94
+ # Config loading
95
+ #
96
+
97
+
98
def load_comprehensive_config():
    """Load ``Config_Files/config.txt`` from the project root.

    The project root is taken to be two directories above the directory that
    contains this module.

    Returns:
        configparser.ConfigParser: the parsed configuration.

    Raises:
        FileNotFoundError: if the config file does not exist.
    """
    # Get the directory of the current script (Utils.py)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    logging.debug(f"Current directory: {current_dir}")

    # Go up two levels to the project root directory (tldw)
    project_root = os.path.dirname(os.path.dirname(current_dir))
    logging.debug(f"Project root directory: {project_root}")

    # Construct the path to the config file
    config_path = os.path.join(project_root, 'Config_Files', 'config.txt')
    logging.debug(f"Config file path: {config_path}")

    # Check if the config file exists
    if not os.path.exists(config_path):
        logging.error(f"Config file not found at {config_path}")
        raise FileNotFoundError(f"Config file not found at {config_path}")

    # Read the config file
    config = configparser.ConfigParser()
    config.read(config_path)

    # Log the sections found in the config file. The f-prefix is required;
    # without it the placeholder text itself was logged.
    logging.debug(f"load_comprehensive_config(): Sections found in config: {config.sections()}")

    return config
124
+
125
+
126
def get_project_root():
    """Get the project root directory (three directory levels above this file)."""
    return os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))


def get_database_dir():
    """Get the ``Databases`` directory located under the project root."""
    return get_project_relative_path('Databases')


def get_database_path(db_name: Union[str, os.PathLike[AnyStr]]) -> str:
    """Get the full path for a database file inside the ``Databases`` directory."""
    return os.path.join(get_database_dir(), str(db_name))


def get_project_relative_path(relative_path: Union[str, os.PathLike[AnyStr]]) -> str:
    """Resolve ``relative_path`` against the project root."""
    return os.path.join(get_project_root(), str(relative_path))


def get_chromadb_path():
    """Get the ChromaDB directory, ``<project root>/Databases/chroma_db``.

    Built from ``get_database_dir()`` so it agrees with the other database
    helpers. It previously climbed only two directory levels and therefore
    resolved inside the package directory rather than the project root.
    """
    return os.path.join(get_database_dir(), 'chroma_db')
144
+
145
def ensure_directory_exists(path):
    """Create ``path`` (including missing parents) unless it is already a directory."""
    os.makedirs(name=path, exist_ok=True)
148
+
149
+ # FIXME - update to include prompt path in return statement
150
+ def load_and_log_configs():
151
+ try:
152
+ config = load_comprehensive_config()
153
+ if config is None:
154
+ logging.error("Config is None, cannot proceed")
155
+ return None
156
+ # API Keys
157
+ anthropic_api_key = config.get('API', 'anthropic_api_key', fallback=None)
158
+ logging.debug(
159
+ f"Loaded Anthropic API Key: {anthropic_api_key[:5]}...{anthropic_api_key[-5:] if anthropic_api_key else None}")
160
+
161
+ cohere_api_key = config.get('API', 'cohere_api_key', fallback=None)
162
+ logging.debug(
163
+ f"Loaded Cohere API Key: {cohere_api_key[:5]}...{cohere_api_key[-5:] if cohere_api_key else None}")
164
+
165
+ groq_api_key = config.get('API', 'groq_api_key', fallback=None)
166
+ logging.debug(f"Loaded Groq API Key: {groq_api_key[:5]}...{groq_api_key[-5:] if groq_api_key else None}")
167
+
168
+ openai_api_key = config.get('API', 'openai_api_key', fallback=None)
169
+ logging.debug(
170
+ f"Loaded OpenAI API Key: {openai_api_key[:5]}...{openai_api_key[-5:] if openai_api_key else None}")
171
+
172
+ huggingface_api_key = config.get('API', 'huggingface_api_key', fallback=None)
173
+ logging.debug(
174
+ f"Loaded HuggingFace API Key: {huggingface_api_key[:5]}...{huggingface_api_key[-5:] if huggingface_api_key else None}")
175
+
176
+ openrouter_api_key = config.get('API', 'openrouter_api_key', fallback=None)
177
+ logging.debug(
178
+ f"Loaded OpenRouter API Key: {openrouter_api_key[:5]}...{openrouter_api_key[-5:] if openrouter_api_key else None}")
179
+
180
+ deepseek_api_key = config.get('API', 'deepseek_api_key', fallback=None)
181
+ logging.debug(
182
+ f"Loaded DeepSeek API Key: {deepseek_api_key[:5]}...{deepseek_api_key[-5:] if deepseek_api_key else None}")
183
+
184
+ mistral_api_key = config.get('API', 'mistral_api_key', fallback=None)
185
+ logging.debug(
186
+ f"Loaded Mistral API Key: {mistral_api_key[:5]}...{mistral_api_key[-5:] if mistral_api_key else None}")
187
+
188
+ # Models
189
+ anthropic_model = config.get('API', 'anthropic_model', fallback='claude-3-sonnet-20240229')
190
+ cohere_model = config.get('API', 'cohere_model', fallback='command-r-plus')
191
+ groq_model = config.get('API', 'groq_model', fallback='llama3-70b-8192')
192
+ openai_model = config.get('API', 'openai_model', fallback='gpt-4-turbo')
193
+ huggingface_model = config.get('API', 'huggingface_model', fallback='CohereForAI/c4ai-command-r-plus')
194
+ openrouter_model = config.get('API', 'openrouter_model', fallback='microsoft/wizardlm-2-8x22b')
195
+ deepseek_model = config.get('API', 'deepseek_model', fallback='deepseek-chat')
196
+ mistral_model = config.get('API', 'mistral_model', fallback='mistral-large-latest')
197
+
198
+ logging.debug(f"Loaded Anthropic Model: {anthropic_model}")
199
+ logging.debug(f"Loaded Cohere Model: {cohere_model}")
200
+ logging.debug(f"Loaded Groq Model: {groq_model}")
201
+ logging.debug(f"Loaded OpenAI Model: {openai_model}")
202
+ logging.debug(f"Loaded HuggingFace Model: {huggingface_model}")
203
+ logging.debug(f"Loaded OpenRouter Model: {openrouter_model}")
204
+ logging.debug(f"Loaded Deepseek Model: {deepseek_model}")
205
+ logging.debug(f"Loaded Mistral Model: {mistral_model}")
206
+
207
+ # Local-Models
208
+ kobold_api_ip = config.get('Local-API', 'kobold_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
209
+ kobold_api_key = config.get('Local-API', 'kobold_api_key', fallback='')
210
+
211
+ llama_api_IP = config.get('Local-API', 'llama_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
212
+ llama_api_key = config.get('Local-API', 'llama_api_key', fallback='')
213
+
214
+ ooba_api_IP = config.get('Local-API', 'ooba_api_IP', fallback='http://127.0.0.1:5000/v1/chat/completions')
215
+ ooba_api_key = config.get('Local-API', 'ooba_api_key', fallback='')
216
+
217
+ tabby_api_IP = config.get('Local-API', 'tabby_api_IP', fallback='http://127.0.0.1:5000/api/v1/generate')
218
+ tabby_api_key = config.get('Local-API', 'tabby_api_key', fallback=None)
219
+ tabby_model = config.get('models', 'tabby_model', fallback=None)
220
+
221
+ vllm_api_url = config.get('Local-API', 'vllm_api_IP', fallback='http://127.0.0.1:500/api/v1/chat/completions')
222
+ vllm_api_key = config.get('Local-API', 'vllm_api_key', fallback=None)
223
+ vllm_model = config.get('Local-API', 'vllm_model', fallback=None)
224
+
225
+ ollama_api_url = config.get('Local-API', 'ollama_api_IP', fallback='http://127.0.0.1:11434/api/generate')
226
+ ollama_api_key = config.get('Local-API', 'ollama_api_key', fallback=None)
227
+ ollama_model = config.get('Local-API', 'ollama_model', fallback=None)
228
+
229
+ aphrodite_api_url = config.get('Local-API', 'aphrodite_api_IP', fallback='http://127.0.0.1:8080/v1/chat/completions')
230
+ aphrodite_api_key = config.get('Local-API', 'aphrodite_api_key', fallback='')
231
+
232
+ custom_openai_api_key = config.get('API', 'custom_openai_api_key', fallback=None)
233
+ custom_openai_api_url = config.get('API', 'custom_openai_url', fallback=None)
234
+ logging.debug(
235
+ f"Loaded Custom openai-like endpoint API Key: {custom_openai_api_key[:5]}...{custom_openai_api_key[-5:] if custom_openai_api_key else None}")
236
+
237
+ logging.debug(f"Loaded Kobold API IP: {kobold_api_ip}")
238
+ logging.debug(f"Loaded Llama API IP: {llama_api_IP}")
239
+ logging.debug(f"Loaded Ooba API IP: {ooba_api_IP}")
240
+ logging.debug(f"Loaded Tabby API IP: {tabby_api_IP}")
241
+ logging.debug(f"Loaded VLLM API URL: {vllm_api_url}")
242
+
243
+ # Retrieve output paths from the configuration file
244
+ output_path = config.get('Paths', 'output_path', fallback='results')
245
+ logging.debug(f"Output path set to: {output_path}")
246
+
247
+ # Retrieve processing choice from the configuration file
248
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
249
+ logging.debug(f"Processing choice set to: {processing_choice}")
250
+
251
+ # Prompts - FIXME
252
+ prompt_path = config.get('Prompts', 'prompt_path', fallback='Databases/prompts.db')
253
+
254
+ return {
255
+ 'api_keys': {
256
+ 'anthropic': anthropic_api_key,
257
+ 'cohere': cohere_api_key,
258
+ 'groq': groq_api_key,
259
+ 'openai': openai_api_key,
260
+ 'huggingface': huggingface_api_key,
261
+ 'openrouter': openrouter_api_key,
262
+ 'deepseek': deepseek_api_key,
263
+ 'mistral': mistral_api_key,
264
+ 'kobold': kobold_api_key,
265
+ 'llama': llama_api_key,
266
+ 'ooba': ooba_api_key,
267
+ 'tabby': tabby_api_key,
268
+ 'vllm': vllm_api_key,
269
+ 'ollama': ollama_api_key,
270
+ 'aphrodite': aphrodite_api_key,
271
+ 'custom_openai_api_key': custom_openai_api_key
272
+ },
273
+ 'models': {
274
+ 'anthropic': anthropic_model,
275
+ 'cohere': cohere_model,
276
+ 'groq': groq_model,
277
+ 'openai': openai_model,
278
+ 'huggingface': huggingface_model,
279
+ 'openrouter': openrouter_model,
280
+ 'deepseek': deepseek_model,
281
+ 'mistral': mistral_model,
282
+ 'vllm': vllm_model,
283
+ 'tabby': tabby_model,
284
+ 'ollama': ollama_model
285
+
286
+ },
287
+ 'local_api_ip': {
288
+ 'kobold': kobold_api_ip,
289
+ 'llama': llama_api_IP,
290
+ 'ooba': ooba_api_IP,
291
+ 'tabby': tabby_api_IP,
292
+ 'vllm': vllm_api_url,
293
+ 'ollama': ollama_api_url,
294
+ 'aphrodite': aphrodite_api_url,
295
+ 'custom_openai_api_ip': custom_openai_api_url
296
+ },
297
+ 'output_path': output_path,
298
+ 'processing_choice': processing_choice,
299
+ 'db_config': {
300
+ 'prompt_path': get_project_relative_path(config.get('Prompts', 'prompt_path', fallback='Databases/prompts.db')),
301
+ 'db_type': config.get('Database', 'type', fallback='sqlite'),
302
+ 'sqlite_path': get_project_relative_path(config.get('Database', 'sqlite_path', fallback='Databases/media_summary.db')),
303
+ 'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
304
+ 'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200),
305
+ 'chroma_db_path': get_project_relative_path(config.get('Database', 'chroma_db_path', fallback='Databases/chroma.db'))
306
+ },
307
+ }
308
+
309
+ except Exception as e:
310
+ logging.error(f"Error loading config: {str(e)}")
311
+ return None
312
+
313
+
314
+ #
315
+ # End of Config loading
316
+ #######################################################################################################################
317
+
318
+
319
+ #######################################################################################################################
320
+ #
321
+ # Prompt Handling Functions
322
+
323
+
324
+
325
+ #
326
+ # End of Prompt Handling Functions
327
+ ### #############################################################################################################
328
+
329
+ #######################################################################################################################
330
+ #
331
+ # Misc-Functions
332
+
333
+ # Log file
334
+ # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
335
+
336
def _format_metadata_value(key, value):
    """Return the display string for a single non-None metadata value."""
    if isinstance(value, list):
        # Join list items with commas
        return ", ".join(str(item) for item in value)

    if key == 'upload_date' and len(str(value)) == 8:
        # Format date as YYYY-MM-DD. Slice the string form so that an
        # integer date such as 20230615 works as well as '20230615'.
        date_text = str(value)
        return f"{date_text[:4]}-{date_text[4:6]}-{date_text[6:]}"

    if key in ['view_count', 'like_count']:
        # Thousands separators only make sense for real numbers; anything
        # else (e.g. a string placeholder) is shown unchanged.
        if isinstance(value, (int, float)):
            return f"{value:,}"
        return str(value)

    if key == 'duration':
        # Convert seconds to HH:MM:SS format. The value is coerced to int
        # first because the ':02d' format rejects floats and strings.
        try:
            total_seconds = int(value)
        except (TypeError, ValueError):
            return str(value)
        hours, remainder = divmod(total_seconds, 3600)
        minutes, seconds = divmod(remainder, 60)
        return f"{hours:02d}:{minutes:02d}:{seconds:02d}"

    return str(value)


def format_metadata_as_text(metadata):
    """
    Render a metadata mapping as a human-readable, multi-line string.

    Parameters:
    metadata (dict): Mapping of metadata field names to values. Entries whose
        value is None are omitted.

    Returns:
    str: "No metadata available" when metadata is empty or None; otherwise a
        "Video Metadata:" header followed by one "Key: value" line per entry,
        with surrounding whitespace stripped.
    """
    if not metadata:
        return "No metadata available"

    lines = ["Video Metadata:"]
    for key, value in metadata.items():
        if value is None:
            continue
        lines.append(f"{key.capitalize()}: {_format_metadata_value(key, value)}")
    return "\n".join(lines).strip()
362
+
363
+ # # Example usage:
364
+ # example_metadata = {
365
+ # 'title': 'Sample Video Title',
366
+ # 'uploader': 'Channel Name',
367
+ # 'upload_date': '20230615',
368
+ # 'view_count': 1000000,
369
+ # 'like_count': 50000,
370
+ # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
371
+ # 'tags': ['tag1', 'tag2', 'tag3'],
372
+ # 'description': 'This is a sample video description.'
373
+ # }
374
+ #
375
+ # print(format_metadata_as_text(example_metadata))
376
+
377
+
378
def convert_to_seconds(time_str):
    """
    Convert a time specification to a whole number of seconds.

    Parameters:
    time_str (str | int | float | None): Either a number of seconds, or a
        string in the form "HH:MM:SS", "MM:SS" or "SS".

    Returns:
    int: The number of seconds; 0 when time_str is empty, None or zero.

    Raises:
    ValueError: If the string has more than three ':'-separated parts or any
        part is not an integer.
    """
    if not time_str:
        return 0

    # If it's already a number, assume it's in seconds. (Calling .isdigit()
    # on an int/float would raise AttributeError.)
    if isinstance(time_str, (int, float)):
        return int(time_str)

    # Parse time string in format HH:MM:SS, MM:SS, or SS
    time_parts = str(time_str).split(':')
    if len(time_parts) > 3:
        raise ValueError(f"Invalid time format: {time_str}")

    try:
        numbers = [int(part) for part in time_parts]
    except ValueError:
        raise ValueError(f"Invalid time format: {time_str}") from None

    # Fold left-to-right in base 60: ((hours * 60) + minutes) * 60 + seconds.
    total_seconds = 0
    for number in numbers:
        total_seconds = total_seconds * 60 + number
    return total_seconds
399
+
400
+ #
401
+ # End of Misc-Functions
402
+ #######################################################################################################################
403
+
404
+
405
+ #######################################################################################################################
406
+ #
407
+ # File-saving Function Definitions
408
def save_to_file(video_urls, filename):
    """
    Write a list of video URLs to a text file, one URL per line.

    Parameters:
    video_urls (iterable of str): URLs to write.
    filename (str): Path of the file to create or overwrite.

    The file is written as UTF-8 so that URLs containing non-ASCII characters
    do not depend on the platform's default encoding.
    """
    with open(filename, 'w', encoding='utf-8') as file:
        file.write('\n'.join(video_urls))
    print(f"Video URLs saved to {filename}")
412
+
413
+
414
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """
    Serialize transcription segments as JSON inside the "Results" directory.

    Parameters:
    segments (list): Transcription segments to serialize
    file_name (str): File name to use inside "Results" (default: "transcription_segments.json")

    Returns:
    str: Path of the JSON file that was written
    """
    results_dir = "Results"

    # Create the output directory on first use
    os.makedirs(results_dir, exist_ok=True)

    # Serialize first, then write the text out as UTF-8
    serialized = json.dumps(segments, ensure_ascii=False, indent=4)
    json_file_path = os.path.join(results_dir, file_name)
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json_file.write(serialized)

    return json_file_path
436
+
437
+
438
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
    """
    Download a URL to dest_path, resuming a partial download when possible.

    Data is streamed into "<dest_path>.tmp" and moved to dest_path only after
    the transfer (and optional checksum verification) succeeds.

    Parameters:
    url (str): URL to download.
    dest_path (str): Final path of the downloaded file.
    expected_checksum (str | None): Expected SHA-256 hex digest; when given,
        the downloaded file is verified with verify_checksum().
    max_retries (int): Maximum number of attempts.
    delay (int | float): Seconds to wait between attempts.

    Returns:
    str: dest_path on success.

    Raises:
    Exception: Re-raises the error of the last attempt once max_retries is
        reached (including ValueError on a checksum mismatch).
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Check if a partial download exists and get its size
            existing_size = os.path.getsize(temp_path) if os.path.exists(temp_path) else 0
            resume_header = {'Range': f'bytes={existing_size}-'} if existing_size else {}

            with requests.get(url, stream=True, headers=resume_header) as response:
                response.raise_for_status()

                # 'Range' is a *request* header, so it never appears in the
                # response. The server signals that it honoured the range with
                # status 206 (Partial Content); a plain 200 means it is sending
                # the whole file again and the partial data must be discarded.
                resumed = bool(resume_header) and response.status_code == 206
                initial_pos = existing_size if resumed else 0
                mode = 'ab' if resumed else 'wb'

                # Content-Length covers only the bytes of this response, so add
                # what is already on disk to get the full size for the bar.
                content_length = int(response.headers.get('content-length', 0))
                total_size = content_length + initial_pos if content_length else 0

                with open(temp_path, mode) as temp_file, tqdm(
                    total=total_size, unit='B', unit_scale=True, desc=dest_path,
                    initial=initial_pos, ascii=True
                ) as pbar:
                    for chunk in response.iter_content(chunk_size=8192):
                        if chunk:  # filter out keep-alive new chunks
                            temp_file.write(chunk)
                            pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum:
                if not verify_checksum(temp_path, expected_checksum):
                    # Drop the corrupt data so the next attempt starts clean
                    os.remove(temp_path)
                    raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # Move the file to the final destination. os.replace overwrites an
            # existing destination on every platform (os.rename fails on
            # Windows when the target already exists).
            os.replace(temp_path, dest_path)
            print("Download complete and verified!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
483
+
484
def create_download_directory(title):
    """
    Return the per-title directory under "Results", creating it if needed.

    Parameters:
    title (str): Title to derive the directory name from; it is passed
        through normalize_title() first.

    Returns:
    str: Path of the directory ("Results/<normalized title>").
    """
    # Normalize the title before using it as a directory name
    safe_title = normalize_title(title)
    logging.debug(f"{title} successfully normalized")

    session_path = os.path.join("Results", safe_title)
    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
        return session_path

    os.makedirs(session_path, exist_ok=True)
    logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
496
+
497
+
498
def safe_read_file(file_path):
    """
    Read a text file, trying several encodings until one decodes it.

    Parameters:
    file_path (str): Path of the file to read.

    Returns:
    str: The decoded file content. On failure a human-readable message is
        returned instead of raising: "File not found: ...",
        "An error occurred: ..." or "Unable to decode the file ...".
    """
    # NOTE: 'latin-1' can decode any byte sequence, so the encodings listed
    # after it are effectively never reached.
    encodings = ['utf-8', 'utf-16', 'ascii', 'latin-1', 'iso-8859-1', 'cp1252']
    for encoding in encodings:
        try:
            with open(file_path, 'r', encoding=encoding) as file:
                return file.read()
        except UnicodeError:
            # Catch the base class, not just UnicodeDecodeError: the 'utf-16'
            # codec raises a plain UnicodeError for input without a BOM, which
            # previously ended up in the generic handler below and aborted the
            # search before the remaining encodings were tried.
            continue
        except FileNotFoundError:
            return f"File not found: {file_path}"
        except Exception as e:
            return f"An error occurred: {e}"
    return f"Unable to decode the file {file_path} with any of the attempted encodings: {encodings}"
511
+
512
+ #
513
+ # End of Files-saving Function Definitions
514
+ #######################################################################################################################
515
+
516
+
517
+ #######################################################################################################################
518
+ #
519
+ # UUID-Functions
520
+
521
def generate_unique_filename(base_path, base_filename):
    """Return base_filename, or "<name>_<n><ext>" with the lowest n >= 1 that does not exist in base_path."""
    stem, extension = os.path.splitext(base_filename)
    candidate = base_filename
    suffix = 0
    while os.path.exists(os.path.join(base_path, candidate)):
        suffix += 1
        candidate = f"{stem}_{suffix}{extension}"
    return candidate
530
+
531
+
532
def generate_unique_identifier(file_path):
    """
    Build an identifier for a local file from the time, its content and name.

    Parameters:
    file_path (str): Path of an existing, readable file.

    Returns:
    str: "local:<unix timestamp>:<first 8 hex chars of the MD5 of the file
        content>:<file name>".
    """
    filename = os.path.basename(file_path)
    timestamp = int(time.time())

    # Generate a hash of the file content. The file is hashed in fixed-size
    # chunks so that large files are not loaded into memory in one piece.
    # MD5 only serves as a content fingerprint here.
    hasher = hashlib.md5()
    with open(file_path, 'rb') as f:
        for chunk in iter(lambda: f.read(65536), b''):
            hasher.update(chunk)
    content_hash = hasher.hexdigest()[:8]  # Use first 8 characters of the hash

    return f"local:{timestamp}:{content_hash}:{filename}"
544
+
545
+ #
546
+ # End of UUID-Functions
547
+ #######################################################################################################################
548
+
549
+
550
+ #######################################################################################################################
551
+ #
552
+ # Backup code
553
+
554
+ #
555
+ # End of backup code
556
+ #######################################################################################################################
557
+
558
+
559
+ #######################################################################################################################
560
+ #
561
+ # Sanitization/Verification Functions
562
+
563
+ # Helper function to validate URL format
564
def is_valid_url(url: str) -> bool:
    """
    Check whether url looks like an http(s)/ftp(s) URL.

    The host part may be a domain name, "localhost", a dotted IPv4 address or
    an IPv6 literal, optionally followed by a port and a path or query.
    Matching is case-insensitive.

    Parameters:
    url (str): The string to check.

    Returns:
    bool: True if the whole string matches the URL pattern, else False.
    """
    pattern_parts = (
        r'^(?:http|ftp)s?://',  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # domain...
        r'localhost|',  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|',  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)',  # ...or ipv6
        r'(?::\d+)?',  # optional port
        r'(?:/?|[/?]\S+)$',
    )
    url_pattern = re.compile(''.join(pattern_parts), re.IGNORECASE)
    if url_pattern.match(url) is None:
        return False
    return True
574
+
575
+
576
def verify_checksum(file_path, expected_checksum):
    """
    Check a file's SHA-256 digest against an expected hex digest.

    Parameters:
    file_path (str): Path of the file to hash.
    expected_checksum (str): Expected SHA-256 digest in hexadecimal. Letter
        case and surrounding whitespace are ignored.

    Returns:
    bool: True if the digests match, else False.
    """
    sha256_hash = hashlib.sha256()
    with open(file_path, 'rb') as f:
        # Hash in blocks so large files are not read into memory at once
        for byte_block in iter(lambda: f.read(4096), b''):
            sha256_hash.update(byte_block)
    # hexdigest() is always lower-case; normalise the expected value so an
    # upper-case or whitespace-padded checksum is not rejected.
    return sha256_hash.hexdigest() == expected_checksum.strip().lower()
582
+
583
+
584
def normalize_title(title):
    """
    Reduce a title to ASCII and neutralise characters unsafe in file names.

    The title is NFKD-normalized and non-ASCII characters are dropped; then
    '/', '\\' and ':' become '_' while '"', '*', '?', '<', '>' and '|' are
    removed.

    Parameters:
    title (str): The title to normalize.

    Returns:
    str: The normalized title.
    """
    ascii_title = (
        unicodedata.normalize('NFKD', title)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    # One translation pass: first two arguments map to '_', third is deleted
    replacements = str.maketrans('/\\:', '___', '"*?<>|')
    return ascii_title.translate(replacements)
591
+
592
+
593
def clean_youtube_url(url):
    """
    Return url with any 'list' query parameter removed.

    Parameters:
    url (str): The URL to clean.

    Returns:
    str: The URL rebuilt from its parsed components with the remaining query
        parameters re-encoded.
    """
    parts = urlparse(url)
    params = parse_qs(parts.query)
    # Discard the playlist reference if there is one
    params.pop('list', None)
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))
601
+
602
def sanitize_filename(filename):
    """
    Strip characters that are invalid in file names and tidy whitespace.

    The characters < > : " / \\ | ? * are removed, every run of whitespace is
    collapsed to a single space, and leading/trailing whitespace is trimmed.

    Parameters:
    filename (str): The file name to sanitize.

    Returns:
    str: The sanitized file name.
    """
    without_invalid = re.sub(r'[<>:"/\\|?*]', '', filename)
    collapsed = re.sub(r'\s+', ' ', without_invalid)
    return collapsed.strip()
607
+
608
+
609
def format_transcription(content):
    """
    Format transcription text for HTML display.

    Literal "\\n" sequences are turned into line breaks, a space is ensured
    after sentence-ending periods, whitespace between sentences is normalised
    to a single space, and lines are joined with "<br>".

    Parameters:
    content (str): The raw transcription text.

    Returns:
    str: The formatted text with lines separated by "<br>".
    """
    # Replace '\n' with actual line breaks
    content = content.replace('\\n', '\n')

    formatted_lines = []
    # Split the content by newlines first
    for line in content.split('\n'):
        # Add a space after periods for better readability, except for a
        # period sitting between two digits: that is a decimal point, and
        # spacing it would turn "3.14" into "3. 14".
        line = re.sub(r'(?<!\d)\.|\.(?!\d)', '. ', line)

        # Split into sentences after '.', '!' or '?' followed by spaces
        sentences = re.split('(?<=[.!?]) +', line)

        # Trim whitespace from each sentence and drop empty ones
        formatted_sentences = [sentence.strip() for sentence in sentences if sentence.strip()]

        # Join the formatted sentences
        formatted_lines.append(' '.join(formatted_sentences))

    # Join the lines with HTML line breaks
    return '<br>'.join(formatted_lines)
632
+
633
+
634
def format_file_path(file_path, fallback_path=None):
    """
    Return the first of file_path / fallback_path that exists on disk.

    Parameters:
    file_path (str | None): Preferred path.
    fallback_path (str | None): Path to use when file_path is empty or does
        not exist.

    Returns:
    str | None: file_path if it exists, otherwise fallback_path if it exists,
        otherwise None.
    """
    primary_exists = bool(file_path) and os.path.exists(file_path)
    if primary_exists:
        logging.debug(f"File exists: {file_path}")
        return file_path

    fallback_exists = bool(fallback_path) and os.path.exists(fallback_path)
    if not fallback_exists:
        logging.debug(f"File does not exist: {file_path}. No fallback path available.")
        return None

    logging.debug(f"File does not exist: {file_path}. Returning fallback path: {fallback_path}")
    return fallback_path
644
+
645
+ #
646
+ # End of Sanitization/Verification Functions
647
+ #######################################################################################################################
648
+
649
+
650
+ #######################################################################################################################
651
+ #
652
+ # DB Config Loading
653
+
654
+
655
def get_db_config():
    """
    Load the database settings from Config_Files/config.txt.

    The config file is looked up relative to this module: two directory
    levels above the directory that contains it.

    Returns:
    dict: The keys 'type', 'sqlite_path', 'elasticsearch_host' and
        'elasticsearch_port', read from the [Database] section. Each key
        falls back to a default when the file, the section or the option is
        missing.
    """
    # Get the directory of the current script
    current_dir = os.path.dirname(os.path.abspath(__file__))
    # Go up two levels to the project root directory (tldw)
    project_root = os.path.dirname(os.path.dirname(current_dir))
    # Construct the path to the config file
    config_path = os.path.join(project_root, 'Config_Files', 'config.txt')
    # Read the config file (ConfigParser.read silently skips a missing file)
    config = configparser.ConfigParser()
    config.read(config_path)
    # Return the database configuration
    return {
        # Use get() with a fallback like the other keys: indexing
        # config['Database']['type'] raised KeyError whenever the file or the
        # section was absent.
        'type': config.get('Database', 'type', fallback='sqlite'),
        'sqlite_path': config.get('Database', 'sqlite_path', fallback='./Databases/media_summary.db'),
        'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
        'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
    }
672
+
673
+
674
+
675
+
676
+ #
677
+ # End of DB Config Loading
678
+ #######################################################################################################################