oceansweep committed on
Commit
e15e1c7
·
verified ·
1 Parent(s): af37bcf

Upload 22 files

Browse files
App_Function_Libraries/Audio/Audio_Files.py ADDED
@@ -0,0 +1,692 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio_Files.py
2
+ #########################################
3
+ # Audio Processing Library
4
+ # This library is used to download or load audio files from a local directory.
5
+ #
6
+ ####
7
+ #
8
+ # Functions:
9
+ #
10
+ # download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None)
11
+ # process_audio(audio_file_path, num_speakers=2, whisper_model="small.en", ...)
12
+ # process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, ...)
13
+ #
14
+ #
15
+ #########################################
16
+ # Imports
17
+ import json
18
+ import logging
19
+ import os
20
+ import subprocess
21
+ import tempfile
22
+ import uuid
23
+ from datetime import datetime
24
+ from pathlib import Path
25
+
26
+ import requests
27
+ import yt_dlp
28
+
29
+ from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
30
+ from App_Function_Libraries.Chunk_Lib import improved_chunking_process
31
+ #
32
+ # Local Imports
33
+ from App_Function_Libraries.DB.DB_Manager import add_media_to_database, add_media_with_keywords, \
34
+ check_media_and_whisper_model
35
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
36
+ perform_summarization
37
+ from App_Function_Libraries.Utils.Utils import create_download_directory, save_segments_to_json, downloaded_files, \
38
+ sanitize_filename
39
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
40
+
41
+ #
42
+ #######################################################################################################################
43
+ # Function Definitions
44
+ #
45
+
46
+ MAX_FILE_SIZE = 500 * 1024 * 1024
47
+
48
+
49
def download_audio_file(url, current_whisper_model="", use_cookies=False, cookies=None):
    """Download the audio file at ``url`` into the local ``downloads`` directory.

    The database is consulted first through ``check_media_and_whisper_model``;
    when it reports that no download is needed, nothing is fetched.

    Args:
        url: Direct URL of the audio file.
        current_whisper_model: Whisper model name forwarded to the database check.
        use_cookies: When true and ``cookies`` is non-empty, the cookies are sent
            with the request.
        cookies: JSON string encoding an object of cookie name/value pairs.

    Returns:
        The path of the saved file, or ``None`` when the download was skipped.

    Raises:
        requests.RequestException: On network or HTTP errors, including timeouts.
        ValueError: If the file is larger than ``MAX_FILE_SIZE``.
    """
    try:
        # Check if media already exists in the database and compare whisper models
        should_download, reason = check_media_and_whisper_model(
            url=url,
            current_whisper_model=current_whisper_model
        )
        if not should_download:
            logging.info(f"Skipping audio download: {reason}")
            return None
        logging.info(f"Proceeding with audio download: {reason}")

        # Set up the request headers
        headers = {}
        if use_cookies and cookies:
            try:
                cookie_dict = json.loads(cookies)
                headers['Cookie'] = '; '.join(f'{k}={v}' for k, v in cookie_dict.items())
            except (json.JSONDecodeError, TypeError, AttributeError):
                # TypeError: cookies is not a str/bytes; AttributeError: valid JSON
                # that is not an object (e.g. a list) and so has no .items().
                logging.warning("Invalid cookie format. Proceeding without cookies.")

        size_error = f"File size exceeds the {MAX_FILE_SIZE // (1024 * 1024)}MB limit."

        # The context manager releases the streamed connection on every exit path;
        # the timeout stops a stalled server from blocking the caller forever.
        with requests.get(url, headers=headers, stream=True, timeout=(10, 60)) as response:
            response.raise_for_status()

            # The declared size is only advisory; a missing or malformed header
            # is treated as "unknown" and the limit is enforced while streaming.
            try:
                declared_size = int(response.headers.get('content-length', 0))
            except (TypeError, ValueError):
                declared_size = 0
            if declared_size > MAX_FILE_SIZE:
                raise ValueError(size_error)

            # Generate a unique filename inside the downloads directory
            os.makedirs('downloads', exist_ok=True)
            file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3"
            save_path = os.path.join('downloads', file_name)

            bytes_written = 0
            try:
                with open(save_path, 'wb') as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        if not chunk:
                            continue
                        bytes_written += len(chunk)
                        if bytes_written > MAX_FILE_SIZE:
                            raise ValueError(size_error)
                        f.write(chunk)
            except Exception:
                # Do not leave a truncated file behind for later stages to pick up.
                try:
                    if os.path.exists(save_path):
                        os.remove(save_path)
                except OSError as cleanup_error:
                    logging.warning(f"Could not remove partial download {save_path}: {cleanup_error}")
                raise

        logging.info(f"Audio file downloaded successfully: {save_path}")
        return save_path

    except requests.RequestException as e:
        logging.error(f"Error downloading audio file: {str(e)}")
        raise
    except ValueError as e:
        logging.error(str(e))
        raise
    except Exception as e:
        logging.error(f"Unexpected error downloading audio file: {str(e)}")
        raise
108
+
109
+
110
def process_audio(
        audio_file_path,
        num_speakers=2,
        whisper_model="small.en",
        custom_prompt_input=None,
        offset=0,
        api_name=None,
        api_key=None,
        vad_filter=False,
        rolling_summarization=False,
        detail_level=0.01,
        keywords="default,no_keyword_set",
        chunk_text_by_words=False,
        max_words=0,
        chunk_text_by_sentences=False,
        max_sentences=0,
        chunk_text_by_paragraphs=False,
        max_paragraphs=0,
        chunk_text_by_tokens=False,
        max_tokens=0
):
    """Transcribe an audio file, optionally summarize it, and store the result.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        whisper_model: Whisper model name passed to ``perform_transcription``.
        custom_prompt_input: Prompt forwarded to the summarizer and the database.
        offset: Offset passed to ``perform_transcription``.
        api_name: Summarization backend; when falsy no summary is generated.
        api_key: Credential forwarded to ``perform_summarization``.
        vad_filter: Passed to ``perform_transcription``.
        rolling_summarization: Requests rolling summarization, which is not
            implemented yet; the standard summarizer is used instead.
        keywords: Keywords stored with the database entry.

        The remaining parameters (``num_speakers``, ``detail_level`` and the
        ``chunk_text_by_*`` / ``max_*`` options) are accepted for interface
        compatibility and are not used by this function.

    Returns:
        A 6-tuple ``(transcription, summary, json_file_path, summary_file_path,
        None, None)``. On failure the first element is an error string and the
        rest are ``None``.
    """
    try:
        # Perform transcription
        audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)

        if audio_file_path is None or segments is None:
            logging.error("Process_Audio: Transcription failed or segments not available.")
            return "Process_Audio: Transcription failed.", None, None, None, None, None

        logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_Audio: Transcription segments: {segments}")

        transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
        logging.debug(f"Process_Audio: Transcription text: {transcription_text}")

        # Save segments to JSON
        segments_json_path = save_segments_to_json(segments)

        # Perform summarization
        if api_name:
            if rolling_summarization:
                # FIXME rolling summarization is not implemented; fall back to the
                # standard summarizer rather than silently producing no summary.
                logging.warning("Rolling summarization is not implemented; using standard summarization.")
            summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)
            if summary_text is None:
                logging.error("Summary text is None. Check summarization function.")
        else:
            summary_text = 'Summary not available'

        # Save transcription and summary
        download_path = create_download_directory("Audio_Processing")
        json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
                                                                           download_path)

        # TODO: pass the real title, author and file type instead of placeholders.
        add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
                              custom_prompt_input, whisper_model)

        return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f"Error in process_audio: {str(e)}")
        return str(e), None, None, None, None, None
195
+
196
+
197
def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original, custom_keywords, source,
                         custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                         use_multi_level_chunking, chunk_language):
    """Transcribe one audio file, optionally summarize it, and add it to the database.

    Args:
        audio_file_path: Path of the audio file to process.
        whisper_model: Whisper model name passed to ``speech_to_text``.
        api_name: Summarization backend; a summary is produced only when both
            ``api_name`` and ``api_key`` are provided.
        api_key: Credential for the summarization backend.
        keep_original: When false and ``source`` is not ``"Uploaded File"``, the
            audio file is deleted after processing.
        custom_keywords: Extra comma-separated keywords appended to the defaults.
        source: URL or label identifying where the audio came from.
        custom_prompt_input: Summarization prompt; a default prompt is used when
            it is empty.

        The chunking parameters are accepted for interface compatibility and
        are not used by this function.

    Returns:
        A 3-tuple ``(progress_log, transcription, summary)`` of strings. Errors
        are reported inside the tuple rather than raised.
    """
    default_prompt = "Summarize the following audio transcript"
    progress = []
    transcription = ""
    summary = ""

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    try:
        # Check file size before processing
        file_size = os.path.getsize(audio_file_path)
        if file_size > MAX_FILE_SIZE:
            update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
            return "\n".join(progress), "", ""

        # Perform transcription
        update_progress("Starting transcription...")
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
        # Accept the same result shapes as process_audio_files: either a list of
        # segments or a dict carrying that list under 'segments'.
        if isinstance(segments, dict) and 'segments' in segments:
            segments = segments['segments']
        if not isinstance(segments, list):
            raise ValueError("Unexpected segments format received from speech_to_text.")
        transcription = " ".join(segment.get('Text', '') for segment in segments)
        update_progress("Audio transcribed successfully.")

        # Honor the caller's prompt when given, otherwise use the default one.
        prompt = custom_prompt_input or default_prompt

        # Perform summarization if API is provided
        if api_name and api_key:
            update_progress("Starting summarization...")
            summary = perform_summarization(api_name, transcription, prompt, api_key)
            update_progress("Audio summarized successfully.")
        else:
            summary = "No summary available"

        # Prepare keywords
        keywords = "audio,transcription"
        if custom_keywords:
            keywords += f",{custom_keywords}"

        # Add to database
        add_media_with_keywords(
            url=source,
            title=os.path.basename(audio_file_path),
            media_type='audio',
            content=transcription,
            keywords=keywords,
            prompt=prompt,
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=None  # This will use the current date
        )
        update_progress("Audio file added to database successfully.")

        # Uploaded files are never removed or reported here.
        if source != "Uploaded File":
            if keep_original:
                update_progress(f"Original audio file kept at: {audio_file_path}")
            else:
                os.remove(audio_file_path)
                update_progress(f"Temporary file {audio_file_path} removed.")

    except Exception as e:
        update_progress(f"Error processing {source}: {str(e)}")
        transcription = f"Error: {str(e)}"
        summary = "No summary due to error"

    return "\n".join(progress), transcription, summary
262
+
263
+
264
def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                        custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
    """Download, transcribe, summarize and store a batch of audio sources.

    Args:
        audio_urls: Newline-separated audio URLs; may be empty or ``None``.
        audio_file: Optional uploaded file object exposing a ``.name`` path.
        whisper_model: Whisper model name used for transcription.
        api_name: Summarization backend name; no summary when falsy.
        api_key: Credential for the summarization backend.
        use_cookies: Whether to send ``cookies`` with download requests.
        cookies: JSON string of cookies used for downloads.
        keep_original: When true, files created during a successful run are kept.
        custom_keywords: Keywords stored with each database entry.
        custom_prompt_input: Prompt forwarded to the summarizer.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Options forwarded to
            ``improved_chunking_process``.
        diarize: Whether to request diarization from ``speech_to_text``.

    Returns:
        A 3-tuple ``(progress_log, transcriptions, summaries)`` of strings. On
        failure the last two elements are empty strings.
    """
    # Kept as a module-level global, as before, so the nested ffmpeg helpers
    # (and any other reader of the global) see the value chosen by this call.
    global ffmpeg_cmd

    progress = []
    temp_files = []
    all_transcriptions = []
    all_summaries = []

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        for file in temp_files:
            try:
                if os.path.exists(file):
                    os.remove(file)
                    update_progress(f"Temporary file {file} removed.")
            except Exception as e:
                update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    def reencode_mp3(mp3_file_path):
        # Derive the output name from the extension-less path so that inputs
        # without a ".mp3" suffix never map onto themselves.
        base_path, _ = os.path.splitext(mp3_file_path)
        reencoded_mp3_path = f"{base_path}_reencoded.mp3"
        try:
            # '-y' overwrites leftovers from an earlier run instead of prompting.
            subprocess.run([ffmpeg_cmd, '-y', '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path],
                           check=True)
            update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.")
            return reencoded_mp3_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}")
            raise

    def convert_mp3_to_wav(mp3_file_path):
        base_path, _ = os.path.splitext(mp3_file_path)
        wav_file_path = f"{base_path}.wav"
        try:
            subprocess.run([ffmpeg_cmd, '-y', '-i', mp3_file_path, wav_file_path], check=True)
            update_progress(f"Converted {mp3_file_path} to {wav_file_path}.")
            return wav_file_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}")
            raise

    def transcribe(wav_file_path):
        """Return the transcript text, or None when the result shape is unexpected."""
        if diarize:
            segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
        else:
            segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

        # Handle segments nested under 'segments' key
        if isinstance(segments, dict) and 'segments' in segments:
            segments = segments['segments']

        if not isinstance(segments, list):
            update_progress("Unexpected segments format received from speech_to_text.")
            logging.error(f"Unexpected segments format: {segments}")
            return None
        return " ".join(segment.get('Text', '') for segment in segments)

    def summarize(chunked_text, api_ready):
        """Summarize the chunked text, never raising; returns a placeholder on failure."""
        if not api_ready:
            return "No summary available (API not provided)"
        try:
            result = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
            update_progress("Audio summarized successfully.")
            return result
        except Exception as e:
            logging.error(f"Error during summarization: {str(e)}")
            return "Summary generation failed"

    try:
        # Check and set the ffmpeg command
        if os.name == "nt":
            logging.debug("Running on Windows")
            ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe")
        else:
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

        # Ensure ffmpeg is accessible
        if not os.path.exists(ffmpeg_cmd) and os.name == "nt":
            raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}")

        # Define chunk options early to avoid undefined errors
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }

        # Process multiple URLs (a missing URL field is treated as "no URLs")
        urls = [url.strip() for url in (audio_urls or "").split('\n') if url.strip()]

        for i, url in enumerate(urls):
            update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")

            # Keyword arguments keep each value bound to the intended parameter.
            audio_file_path = download_audio_file(
                url,
                current_whisper_model=whisper_model,
                use_cookies=use_cookies,
                cookies=cookies
            )
            if audio_file_path is None:
                # download_audio_file returns None when it decides to skip the URL.
                update_progress(f"Download skipped for {url}.")
                continue
            if not os.path.exists(audio_file_path):
                update_progress(f"Downloaded file not found: {audio_file_path}")
                continue

            temp_files.append(audio_file_path)
            update_progress("Audio file downloaded successfully.")

            # Re-encode MP3 to fix potential issues
            reencoded_mp3_path = reencode_mp3(audio_file_path)
            if not os.path.exists(reencoded_mp3_path):
                update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                continue

            temp_files.append(reencoded_mp3_path)

            # Convert re-encoded MP3 to WAV
            wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
            if not os.path.exists(wav_file_path):
                update_progress(f"Converted WAV file not found: {wav_file_path}")
                continue

            temp_files.append(wav_file_path)

            # Transcribe audio
            transcription = transcribe(wav_file_path)
            if transcription is None:
                continue
            update_progress("Audio transcribed successfully.")

            if not transcription.strip():
                update_progress("Transcription is empty.")
                # Always bind a summary so a stale value from a previous URL is
                # never stored against this one.
                summary = "No summary available (transcription is empty)"
            else:
                # Apply chunking, then summarize
                chunked_text = improved_chunking_process(transcription, chunk_options)
                summary = summarize(chunked_text, bool(api_name))

            all_transcriptions.append(transcription)
            all_summaries.append(summary)

            # Add to database
            add_media_with_keywords(
                url=url,
                title=os.path.basename(wav_file_path),
                media_type='audio',
                content=transcription,
                keywords=custom_keywords,
                prompt=custom_prompt_input,
                summary=summary,
                transcription_model=whisper_model,
                author="Unknown",
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Audio file processed and added to database.")

        # Process uploaded file if provided
        if audio_file:
            if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                update_progress(
                    f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
            else:
                # Re-encode MP3 to fix potential issues
                reencoded_mp3_path = reencode_mp3(audio_file.name)
                if not os.path.exists(reencoded_mp3_path):
                    update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                    if not keep_original:
                        cleanup_files()
                    return update_progress("Processing failed: Re-encoded file not found"), "", ""

                temp_files.append(reencoded_mp3_path)

                # Convert re-encoded MP3 to WAV
                wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
                if not os.path.exists(wav_file_path):
                    update_progress(f"Converted WAV file not found: {wav_file_path}")
                    if not keep_original:
                        cleanup_files()
                    return update_progress("Processing failed: Converted WAV file not found"), "", ""

                temp_files.append(wav_file_path)

                # An unexpected result shape is reported by transcribe(); the
                # upload is still stored, with an empty transcription.
                transcription = transcribe(wav_file_path) or ""

                chunked_text = improved_chunking_process(transcription, chunk_options)
                summary = summarize(chunked_text, bool(api_name and api_key))

                all_transcriptions.append(transcription)
                all_summaries.append(summary)

                add_media_with_keywords(
                    url="Uploaded File",
                    title=os.path.basename(wav_file_path),
                    media_type='audio',
                    content=transcription,
                    keywords=custom_keywords,
                    prompt=custom_prompt_input,
                    summary=summary,
                    transcription_model=whisper_model,
                    author="Unknown",
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )
                update_progress("Uploaded file processed and added to database.")

        # Final cleanup
        if not keep_original:
            cleanup_files()

        final_progress = update_progress("All processing complete.")
        final_transcriptions = "\n\n".join(all_transcriptions)
        final_summaries = "\n\n".join(all_summaries)

        return final_progress, final_transcriptions, final_summaries

    except Exception as e:
        logging.error(f"Error processing audio files: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", ""
498
+
499
+
500
def download_youtube_audio(url):
    """Download the audio track of a video via yt-dlp and store it as an MP3.

    The media is downloaded into a temporary directory, converted to MP3 with
    ffmpeg, and the MP3 is then moved into the local ``downloads`` directory.

    Args:
        url: URL of the video to fetch.

    Returns:
        A 2-tuple ``(file_path, message)``. ``file_path`` is ``None`` when the
        download or conversion failed, in which case ``message`` describes the
        error.
    """
    # Imported here so this function does not depend on a module-level import.
    import shutil

    try:
        # Determine ffmpeg path based on the operating system.
        ffmpeg_path = './Bin/ffmpeg.exe' if os.name == 'nt' else 'ffmpeg'

        # Create a temporary directory
        with tempfile.TemporaryDirectory() as temp_dir:
            # Extract information about the video
            with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
                info_dict = ydl.extract_info(url, download=False)
                sanitized_title = sanitize_filename(info_dict['title'])

            # Setup the temporary filenames
            temp_video_path = Path(temp_dir) / f"{sanitized_title}_temp.mp4"
            temp_audio_path = Path(temp_dir) / f"{sanitized_title}.mp3"

            # Initialize yt-dlp with options for downloading
            ydl_opts = {
                'format': 'bestaudio[ext=m4a]/best[height<=480]',  # Prefer best audio, or video up to 480p
                'ffmpeg_location': ffmpeg_path,
                'outtmpl': str(temp_video_path),
                'noplaylist': True,
                'quiet': True
            }

            # Execute yt-dlp to download the video/audio
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                ydl.download([url])

            # Check if the file exists
            if not temp_video_path.exists():
                raise FileNotFoundError(f"Expected file was not found: {temp_video_path}")

            # Use ffmpeg to extract audio
            ffmpeg_command = [
                ffmpeg_path,
                '-i', str(temp_video_path),
                '-vn',  # No video
                '-acodec', 'libmp3lame',
                '-b:a', '192k',
                str(temp_audio_path)
            ]
            subprocess.run(ffmpeg_command, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

            # Check if the audio file was created
            if not temp_audio_path.exists():
                raise FileNotFoundError(f"Expected audio file was not found: {temp_audio_path}")

            # Create a persistent directory for the download if it doesn't exist
            persistent_dir = Path("downloads")
            persistent_dir.mkdir(exist_ok=True)

            # shutil.move falls back to copy-and-delete when the temporary
            # directory and "downloads" are on different filesystems, where a
            # plain rename (os.replace) raises OSError.
            persistent_file_path = persistent_dir / f"{sanitized_title}.mp3"
            shutil.move(str(temp_audio_path), str(persistent_file_path))

            # Add the file to the list of downloaded files
            downloaded_files.append(str(persistent_file_path))

            return str(persistent_file_path), f"Audio downloaded successfully: {sanitized_title}.mp3"
    except Exception as e:
        # The error is still returned to the caller; logging it as well keeps a
        # record when the caller only displays the message.
        logging.error(f"Error downloading audio: {str(e)}")
        return None, f"Error downloading audio: {str(e)}"
562
+
563
+
564
def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                    keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                    chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
                    use_multi_level_chunking=False, chunk_language='english'):
    """Download, transcribe, summarize and store a podcast episode.

    Args:
        url: URL of the podcast audio.
        title: Title override; the extracted metadata title is used when empty.
        author: Author override; the metadata ``uploader`` is used when empty.
        keywords: Comma-separated keywords; series/episode/season tags taken
            from the metadata are appended.
        custom_prompt: Prompt forwarded to the summarizer and the database.
        api_name: Summarization backend; a summary is produced only when both
            ``api_name`` and ``api_key`` are provided.
        api_key: Credential for the summarization backend.
        whisper_model: Whisper model name used for transcription.
        keep_original: When true, the downloaded audio file is not deleted.
        enable_diarization: Whether to request diarization from ``speech_to_text``.
        use_cookies: Whether to send ``cookies`` with the download request.
        cookies: JSON string of cookies used for the download.
        chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
        use_multi_level_chunking, chunk_language: Options forwarded to
            ``improved_chunking_process``.

    Returns:
        A 7-tuple ``(progress_log, content, summary, title, author, keywords,
        error_message)``. On failure the middle five elements are empty strings
        and ``error_message`` describes the failing stage.
    """
    progress = []
    error_message = ""
    temp_files = []

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        if not keep_original:
            for file in temp_files:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                        update_progress(f"Temporary file {file} removed.")
                except Exception as e:
                    update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    try:
        # Download podcast. Keyword arguments keep each value bound to the
        # intended parameter of download_audio_file.
        audio_file = download_audio_file(
            url,
            current_whisper_model=whisper_model,
            use_cookies=use_cookies,
            cookies=cookies
        )
        if audio_file is None:
            # download_audio_file returns None when it decides to skip the URL;
            # there is nothing to transcribe in that case.
            return (update_progress("Podcast download skipped; nothing to process."), "", "",
                    title or "", author or "", keywords or "", "")
        temp_files.append(audio_file)
        update_progress("Podcast downloaded successfully.")

        # Extract metadata (a missing result is treated as "no metadata")
        metadata = extract_metadata(url) or {}
        title = title or metadata.get('title', 'Unknown Podcast')
        author = author or metadata.get('uploader', 'Unknown Author')

        # Format metadata for storage
        metadata_text = f"""
Metadata:
Title: {title}
Author: {author}
Series: {metadata.get('series', 'N/A')}
Episode: {metadata.get('episode', 'N/A')}
Season: {metadata.get('season', 'N/A')}
Upload Date: {metadata.get('upload_date', 'N/A')}
Duration: {metadata.get('duration', 'N/A')} seconds
Description: {metadata.get('description', 'N/A')}
"""

        # Update keywords; joining only non-empty parts avoids a trailing comma
        # when the metadata contributes no tags.
        keyword_parts = [str(keywords)] if keywords else []
        for tag in ('series', 'episode', 'season'):
            if metadata.get(tag):
                keyword_parts.append(f"{tag}:{metadata[tag]}")
        keywords = ','.join(keyword_parts)

        update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")

        # Transcribe the podcast
        try:
            if enable_diarization:
                segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(audio_file, whisper_model=whisper_model)
            # Accept the same result shapes as process_audio_files.
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']
            if not isinstance(segments, list):
                raise ValueError("Unexpected segments format received from speech_to_text.")
            transcription = " ".join(segment.get('Text', '') for segment in segments)
            update_progress("Podcast transcribed successfully.")
        except Exception as e:
            error_message = f"Transcription failed: {str(e)}"
            raise

        # Apply chunking
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }
        chunked_text = improved_chunking_process(transcription, chunk_options)

        # Combine metadata and transcription
        full_content = metadata_text + "\n\nTranscription:\n" + transcription

        # Summarize if API is provided
        summary = None
        if api_name and api_key:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                update_progress("Podcast summarized successfully.")
            except Exception as e:
                error_message = f"Summarization failed: {str(e)}"
                raise

        # Add to database
        try:
            add_media_with_keywords(
                url=url,
                title=title,
                media_type='podcast',
                content=full_content,
                keywords=keywords,
                prompt=custom_prompt,
                summary=summary or "No summary available",
                transcription_model=whisper_model,
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Podcast added to database successfully.")
        except Exception as e:
            error_message = f"Error adding podcast to database: {str(e)}"
            raise

        # Cleanup
        cleanup_files()

        return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
                title, author, keywords, error_message)

    except Exception as e:
        logging.error(f"Error processing podcast: {str(e)}")
        cleanup_files()
        # Prefer the stage-specific message recorded above so the caller can see
        # which step failed; fall back to the raw exception text otherwise.
        return (update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "",
                error_message or str(e))
688
+
689
+
690
+ #
691
+ #
692
+ #######################################################################################################################
App_Function_Libraries/Audio/Audio_Transcription_Lib.py ADDED
@@ -0,0 +1,329 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
11
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries to run solo for testing
16
+ import gc
17
+ import json
18
+ import logging
19
+ import os
20
+ import queue
21
+ import sys
22
+ import subprocess
23
+ import tempfile
24
+ import threading
25
+ import time
26
+ # DEBUG Imports
27
+ #from memory_profiler import profile
28
+ import pyaudio
29
+ from faster_whisper import WhisperModel as OriginalWhisperModel
30
+ from typing import Optional, Union, List, Dict, Any
31
+ #
32
+ # Import Local
33
+ from App_Function_Libraries.Utils.Utils import load_comprehensive_config
34
+ #
35
+ #######################################################################################################################
36
+ # Function Definitions
37
+ #
38
+
39
+ # Convert video .m4a into .wav using ffmpeg
40
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
41
+ # https://www.gyan.dev/ffmpeg/builds/
42
+ #
43
+
44
+
45
+ whisper_model_instance = None
46
+ config = load_comprehensive_config()
47
+ processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
48
+
49
+
50
+
51
class WhisperModel(OriginalWhisperModel):
    """faster-whisper model wrapper that keeps model files inside the project tree.

    Named model sizes are looked up under ``default_download_root`` first; when
    no local copy exists the parent class downloads the model into that
    directory.
    """

    # This module lives in <project root>/App_Function_Libraries/Audio/, so the
    # project root is three directory levels above this file. Models belong in
    # <project root>/App_Function_Libraries/models/Whisper/.
    tldw_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
    default_download_root = os.path.join(tldw_dir, 'App_Function_Libraries', 'models', 'Whisper')

    valid_model_sizes = [
        "tiny.en", "tiny", "base.en", "base", "small.en", "small", "medium.en", "medium",
        "large-v1", "large-v2", "large-v3", "large", "distil-large-v2", "distil-medium.en",
        "distil-small.en", "distil-large-v3"
    ]

    def __init__(
            self,
            model_size_or_path: str,
            device: str = "auto",
            device_index: Union[int, List[int]] = 0,
            compute_type: str = "default",
            cpu_threads: int = 16,
            num_workers: int = 1,
            download_root: Optional[str] = None,
            local_files_only: bool = False,
            files: Optional[Dict[str, Any]] = None,
            **model_kwargs: Any
    ):
        """Resolve the model location and initialize the parent model.

        Args:
            model_size_or_path: One of ``valid_model_sizes``, a path to a local
                model directory, or any other identifier the parent class
                understands.
            device: Device passed to the parent class.
            device_index: Device index (or indices) passed to the parent class.
            compute_type: Compute type passed to the parent class.
            cpu_threads: Number of CPU threads passed to the parent class.
            num_workers: Number of workers passed to the parent class.
            download_root: Directory for model files; defaults to
                ``default_download_root``.
            local_files_only: Passed to the parent class.
            files: Accepted for interface compatibility; currently not forwarded.
            **model_kwargs: Accepted for interface compatibility; currently not
                forwarded.
        """
        if download_root is None:
            download_root = self.default_download_root

        os.makedirs(download_root, exist_ok=True)

        # FIXME - validate, and add an integration test.
        if model_size_or_path in self.valid_model_sizes:
            # Use an already-present local copy; otherwise keep the size name so
            # the parent class downloads it into download_root.
            local_model_dir = os.path.join(download_root, model_size_or_path)
            if os.path.isdir(local_model_dir):
                model_size_or_path = local_model_dir
        elif os.path.exists(model_size_or_path):
            # An existing local path is normalized to an absolute path.
            model_size_or_path = os.path.abspath(model_size_or_path)
        # Anything else (e.g. a hub model ID) is passed through unchanged for
        # the parent class to resolve.

        super().__init__(
            model_size_or_path,
            device=device,
            device_index=device_index,
            compute_type=compute_type,
            cpu_threads=cpu_threads,
            num_workers=num_workers,
            download_root=download_root,
            local_files_only=local_files_only,
            # FIXME: `files` and `model_kwargs` are not forwarded; confirm the
            # installed parent signature supports them before passing them on.
            # files=files,
            # **model_kwargs
        )
108
+
109
def get_whisper_model(model_name, device):
    """Return a cached ``WhisperModel`` for the requested model and device.

    A single instance is kept in the module-level ``whisper_model_instance``.
    It is reused while the same model name and device are requested and
    replaced when either changes.

    Args:
        model_name: Model size name or path passed to ``WhisperModel``.
        device: Device passed to ``WhisperModel``.

    Returns:
        The ``WhisperModel`` instance matching the request.
    """
    global whisper_model_instance
    requested_key = (model_name, device)
    # The identity of the cached model is tracked on the function object so no
    # additional module-level state is required.
    loaded_key = getattr(get_whisper_model, "_loaded_key", None)
    if whisper_model_instance is None or loaded_key != requested_key:
        logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
        # Drop the old reference first so its memory can be reclaimed before
        # the replacement is loaded.
        whisper_model_instance = None
        whisper_model_instance = WhisperModel(model_name, device=device)
        get_whisper_model._loaded_key = requested_key
    return whisper_model_instance
115
+
116
+ # # FIXME: This is a temporary solution.
117
+ # # This doesn't clear older models, which means potentially a lot of memory is being used...
118
+ # def get_whisper_model(model_name, device):
119
+ # global whisper_model_instance
120
+ # if whisper_model_instance is None:
121
+ # from faster_whisper import WhisperModel
122
+ # logging.info(f"Initializing new WhisperModel with size {model_name} on device {device}")
123
+ #
124
+ # # FIXME - add logic to detect if the model is already downloaded
125
+ # # want to first check if the model is already downloaded
126
+ # # if not, download it using the existing logic in 'WhisperModel'
127
+ # # https://github.com/SYSTRAN/faster-whisper/blob/d57c5b40b06e59ec44240d93485a95799548af50/faster_whisper/transcribe.py#L584
128
+ # # Designated path should be `tldw/App_Function_Libraries/models/Whisper/`
129
+ # WhisperModel.download_root = os.path.join(os.path.dirname(__file__), 'models', 'Whisper')
130
+ # os.makedirs(WhisperModel.download_root, exist_ok=True)
131
+ # whisper_model_instance = WhisperModel(model_name, device=device)
132
+ # return whisper_model_instance
133
+
134
+
135
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
136
+ #DEBUG
137
+ #@profile
138
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """Convert a media file to a 16 kHz, mono, 16-bit PCM ``.wav`` via ffmpeg.

    The output is written next to the input, with the extension replaced by
    ``.wav``. ffmpeg is invoked with an argument list (no shell) on every
    platform and its exit status is always checked.

    Args:
        video_file_path: Path of the media file to convert.
        offset: Start position passed to ffmpeg's ``-ss`` option; a falsy
            value means the beginning of the file.
        overwrite: When False and the output already exists, conversion is
            skipped. When True, ffmpeg is told to overwrite (``-y``).

    Returns:
        The output ``.wav`` path on success (or when skipped), or a dict
        ``{"error": <message>}`` when the conversion fails.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path

    print("Starting conversion process of .m4a to .WAV")

    try:
        if os.name == "nt":
            logging.debug("ffmpeg being ran on windows")
            # Assumes the working directory is the one that contains .\Bin
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"
        elif os.name == "posix":
            ffmpeg_cmd = "ffmpeg"  # Assume 'ffmpeg' is in PATH
        else:
            raise RuntimeError("Unsupported operating system")
        logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")

        command = [ffmpeg_cmd]
        if overwrite:
            # Without -y ffmpeg asks for confirmation and, with stdin
            # detached, refuses to replace an existing output file.
            command.append("-y")
        command += [
            "-ss", str(offset) if offset else "00:00:00",  # Start position
            "-i", video_file_path,
            "-ar", "16000",       # Audio sample rate
            "-ac", "1",           # Number of audio channels
            "-c:a", "pcm_s16le",  # Audio codec
            out_path,
        ]

        # stdin is detached so ffmpeg never waits for interactive input.
        result = subprocess.run(
            command,
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            errors="replace",
        )
        if result.returncode != 0:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")

        logging.info("FFmpeg executed successfully")
        logging.debug("FFmpeg output: %s", result.stdout)
        logging.info("Conversion to WAV completed: %s", out_path)
    except Exception as e:
        # Failures are reported through the return value, not raised.
        logging.error("convert_to_wav: Error converting file to WAV: %s", str(e))
        return {"error": str(e)}
    gc.collect()
    return out_path
194
+
195
+
196
+ # Transcribe .wav into .segments.json
197
+ #DEBUG
198
+ #@profile
199
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """Transcribe an audio file into a list of timed text segments.

    If ``<audio path without extension>.segments.json`` already exists it is
    loaded and returned instead of transcribing again. Otherwise the file is
    transcribed with the model returned by ``get_whisper_model`` and the
    result is saved to ``.segments.json`` and ``.segments_pretty.json`` next
    to the audio file.

    Args:
        audio_file_path: Path (str) of the audio file to transcribe.
        selected_source_lang: Language code passed to ``transcribe``.
        whisper_model: Model name passed to ``get_whisper_model``.
        vad_filter: Passed through to ``transcribe``.
        diarize: Accepted for interface compatibility; not used here.

    Returns:
        A list of dicts with ``Time_Start``, ``Time_End`` and ``Text`` keys.
        The same list shape is returned for cached and fresh results.

    Raises:
        ValueError: If ``audio_file_path`` is None.
        RuntimeError: If loading the cache or transcribing fails, or if no
            segments were produced.
    """
    global whisper_model_instance, processing_choice, segments
    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)

    time_start = time.time()
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    try:
        # Strip only the trailing extension; str.replace() would also rewrite
        # the same text elsewhere in the path and misbehaves when the file
        # has no extension at all.
        base_path = os.path.splitext(audio_file_path)[0]
        out_file = base_path + ".segments.json"
        prettified_out_file = base_path + ".segments_pretty.json"
        if os.path.exists(out_file):
            logging.info("speech-to-text: Segments file already exists: %s", out_file)
            with open(out_file) as f:
                cached = json.load(f)
            # Files are written as {'segments': [...]}; unwrap so a cache hit
            # returns the same list shape as a fresh transcription.
            if isinstance(cached, dict) and 'segments' in cached:
                cached = cached['segments']
            segments = cached
            return segments

        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        # use function and config at top of file
        logging.debug("speech-to-text: Using whisper model: %s", whisper_model)
        whisper_model_instance = get_whisper_model(whisper_model, processing_choice)
        segments_raw, info = whisper_model_instance.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
            # Print to verify its working
            print(f"{segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

            # Log it as well.
            logging.debug(
                f"Transcribed Segment: {segment_chunk.start:.2f}s - {segment_chunk.end:.2f}s | {segment_chunk.text}")

        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")

        # Record which model produced the transcript at the top of the text.
        segments[0]["Text"] = f"This text was transcribed using whisper model: {whisper_model}\n\n" + segments[0]["Text"]
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Save the segments to a JSON file - prettified and non-prettified
        # FIXME so this is an optional flag to save either the prettified json file or the normal one
        save_json = True
        if save_json:
            logging.info("speech-to-text: Saving segments to JSON file")
            output_data = {'segments': segments}

            logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
            with open(prettified_out_file, 'w') as f:
                json.dump(output_data, f, indent=2)

            logging.info("speech-to-text: Saving JSON to %s", out_file)
            with open(out_file, 'w') as f:
                json.dump(output_data, f)

        logging.debug(f"speech-to-text: returning {segments[:500]}")
        gc.collect()
        return segments

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        # Chain the cause so the original traceback is not lost.
        raise RuntimeError("speech-to-text: Error transcribing audio") from e
272
+
273
+
274
def record_audio(duration, sample_rate=16000, chunk_size=1024):
    """Start capturing microphone audio on a background thread.

    Opens a single-channel ``paInt16`` PyAudio input stream and starts a
    thread that reads ``chunk_size`` frames at a time into a queue until
    roughly ``duration`` seconds have been read or the returned event is set.

    Args:
        duration: Maximum recording length in seconds.
        sample_rate: Sample rate passed to the input stream.
        chunk_size: Frames per buffer and per read.

    Returns:
        A tuple ``(p, stream, audio_queue, stop_event, audio_thread)`` in the
        order expected by ``stop_recording``.
    """
    p = pyaudio.PyAudio()
    try:
        stream = p.open(format=pyaudio.paInt16,
                        channels=1,
                        rate=sample_rate,
                        input=True,
                        frames_per_buffer=chunk_size)
    except Exception:
        # Do not leak the PyAudio instance when the stream cannot be opened.
        p.terminate()
        raise

    print("Recording...")
    # Named stop_event so it does not shadow the module-level stop_recording().
    stop_event = threading.Event()
    audio_queue = queue.Queue()

    def audio_callback():
        for _ in range(0, int(sample_rate / chunk_size * duration)):
            if stop_event.is_set():
                break
            data = stream.read(chunk_size)
            audio_queue.put(data)

    audio_thread = threading.Thread(target=audio_callback)
    audio_thread.start()

    return p, stream, audio_queue, stop_event, audio_thread
298
+
299
+
300
def stop_recording(p, stream, audio_queue, stop_recording_event, audio_thread):
    """Stop a running capture and return everything recorded so far.

    Args:
        p: Object whose ``terminate()`` is called last.
        stream: Object whose ``stop_stream()`` and ``close()`` are called.
        audio_queue: Queue holding the captured byte chunks.
        stop_recording_event: Event set to tell the capture thread to stop.
        audio_thread: Capture thread; joined before the queue is drained.

    Returns:
        All queued chunks concatenated into one ``bytes`` object.
    """
    # Ask the capture thread to finish and wait for it, so nothing is added
    # to the queue while it is being drained.
    stop_recording_event.set()
    audio_thread.join()

    chunks = []
    while True:
        if audio_queue.empty():
            break
        chunks.append(audio_queue.get())

    print("Recording finished.")

    # Release the audio resources: stream first, then the owning object.
    stream.stop_stream()
    stream.close()
    p.terminate()

    recorded = b''.join(chunks)
    return recorded
315
+
316
def save_audio_temp(audio_data, sample_rate=16000):
    """Write raw PCM bytes to a temporary ``.wav`` file and return its path.

    The file is written as 1 channel with a 2-byte sample width at
    ``sample_rate``. It is created with ``delete=False``, so the caller is
    responsible for removing it.

    Args:
        audio_data: Raw frame bytes to store.
        sample_rate: Frame rate recorded in the WAV header.

    Returns:
        The filesystem path of the temporary WAV file.
    """
    import wave

    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
        # The context manager closes the writer (finalising the header)
        # even if writing the frames raises.
        with wave.open(temp_file.name, 'wb') as wf:
            wf.setnchannels(1)
            wf.setsampwidth(2)
            wf.setframerate(sample_rate)
            wf.writeframes(audio_data)
    return temp_file.name
326
+
327
+ #
328
+ #
329
+ #######################################################################################################################
App_Function_Libraries/Audio/Diarization_Lib.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Diarization_Lib.py
2
+ #########################################
3
+ # Diarization Library
4
+ # This library is used to perform diarization of audio files.
5
+ # Currently, uses pyannote.audio for diarization and speech_to_text() (faster_whisper) for transcription.
6
+ #
7
+ ####################
8
+ ####################
9
+ # Function List
10
+ #
11
+ # 1. load_pipeline_from_pretrained(path_to_config); 2. audio_diarization(audio_file_path); 3. combine_transcription_and_diarization(audio_file_path)
12
+ #
13
+ ####################
14
+ # Import necessary libraries
15
+ import logging
16
+ from pathlib import Path
17
+ from typing import Dict, List, Any
18
+
19
+ #
20
+ # Import Local Libraries
21
+ from App_Function_Libraries.Audio.Audio_Transcription_Lib import speech_to_text
22
+ #
23
+ # Import 3rd Party Libraries
24
+ from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
25
+ import yaml
26
+ #
27
+ #######################################################################################################################
28
+ # Function Definitions
29
+ #
30
+
31
def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization:
    """Build and instantiate a SpeakerDiarization pipeline from a YAML config.

    The constructor arguments are read from ``pipeline.params`` in the config
    (``segmentation``, ``embedding``, ``clustering``). Optional values found
    under the top-level ``params`` section and selected keys of
    ``pipeline.params`` are collected and passed to ``pipeline.instantiate``.

    Args:
        path_to_config: Location of the YAML configuration file.

    Returns:
        The instantiated ``SpeakerDiarization`` pipeline.

    Raises:
        FileNotFoundError: If the config file does not exist.
        KeyError: If a required config key is missing (logged, re-raised).
        Exception: Any error raised while instantiating (logged, re-raised).
    """
    path_to_config = Path(path_to_config).resolve()
    logging.debug(f"Loading pyannote pipeline from {path_to_config}...")

    if not path_to_config.exists():
        raise FileNotFoundError(f"Config file not found: {path_to_config}")

    # Load the YAML configuration
    with open(path_to_config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    # Debug: print the entire config
    logging.debug(f"Loaded config: {config}")

    # Create the SpeakerDiarization pipeline
    try:
        components = config['pipeline']['params']
        pipeline = SpeakerDiarization(
            segmentation=components['segmentation'],
            embedding=components['embedding'],
            clustering=components['clustering'],
        )
    except KeyError as e:
        logging.error(f"Error accessing config key: {e}")
        raise

    # Optional keys copied from config['params'][<section>] when present.
    section_keys = (
        ("segmentation", ("min_duration_off",)),
        ("clustering", ("method", "min_cluster_size", "threshold")),
    )
    # Optional keys copied from config['pipeline']['params'] when present.
    pipeline_level_keys = (
        "embedding_batch_size",
        "embedding_exclude_overlap",
        "segmentation_batch_size",
    )

    # Set other parameters
    try:
        pipeline_params = {
            "segmentation": {},
            "clustering": {},
        }

        for section, keys in section_keys:
            if 'params' in config and section in config['params']:
                section_cfg = config['params'][section]
                for key in keys:
                    if key in section_cfg:
                        pipeline_params[section][key] = section_cfg[key]

        if 'pipeline' in config and 'params' in config['pipeline']:
            pipeline_cfg = config['pipeline']['params']
            for key in pipeline_level_keys:
                if key in pipeline_cfg:
                    pipeline_params[key] = pipeline_cfg[key]

        logging.debug(f"Pipeline params: {pipeline_params}")
        pipeline.instantiate(pipeline_params)
    except KeyError as e:
        logging.error(f"Error accessing config key: {e}")
        raise
    except Exception as e:
        logging.error(f"Error instantiating pipeline: {e}")
        raise

    return pipeline
93
+
94
+
95
def audio_diarization(audio_file_path: str) -> list:
    """Run speaker diarization on an audio file.

    The pipeline is loaded from ``models/pyannote_diarization_config.yaml``
    located next to this module, then applied to ``audio_file_path``.

    Args:
        audio_file_path: Path of the audio file to diarize.

    Returns:
        A list of dicts with ``start``, ``end`` and ``speaker`` keys, one per
        track yielded by the pipeline result.

    Raises:
        Exception: Whatever loading the pipeline raises (logged, re-raised).
        RuntimeError: If running the pipeline or reading its result fails.
    """
    logging.info('audio-diarization: Loading pyannote pipeline')

    module_dir = Path(__file__).parent.resolve()
    config_path = module_dir / 'models' / 'pyannote_diarization_config.yaml'
    logging.info(f"audio-diarization: Loading pipeline from {config_path}")

    try:
        pipeline = load_pipeline_from_pretrained(config_path)
    except Exception as e:
        logging.error(f"Failed to load pipeline: {str(e)}")
        raise

    logging.info(f"audio-diarization: Audio file path: {audio_file_path}")

    try:
        logging.info('audio-diarization: Starting diarization...')
        diarization_result = pipeline(audio_file_path)

        speaker_turns = []
        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
            entry = dict(start=turn.start, end=turn.end, speaker=speaker)
            logging.debug(f"Segment: {entry}")
            speaker_turns.append(entry)
        logging.info("audio-diarization: Diarization completed with pyannote")
    except Exception as e:
        logging.error(f"audio-diarization: Error performing diarization: {str(e)}")
        raise RuntimeError("audio-diarization: Error performing diarization") from e

    return speaker_turns
130
+
131
+
132
+ # Old
133
+ # def audio_diarization(audio_file_path):
134
+ # logging.info('audio-diarization: Loading pyannote pipeline')
135
+ #
136
+ # #config file loading
137
+ # current_dir = os.path.dirname(os.path.abspath(__file__))
138
+ # # Construct the path to the config file
139
+ # config_path = os.path.join(current_dir, 'Config_Files', 'config.txt')
140
+ # # Read the config file
141
+ # config = configparser.ConfigParser()
142
+ # config.read(config_path)
143
+ # processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
144
+ #
145
+ # base_dir = Path(__file__).parent.resolve()
146
+ # config_path = base_dir / 'models' / 'config.yaml'
147
+ # pipeline = load_pipeline_from_pretrained(config_path)
148
+ #
149
+ # time_start = time.time()
150
+ # if audio_file_path is None:
151
+ # raise ValueError("audio-diarization: No audio file provided")
152
+ # logging.info("audio-diarization: Audio file path: %s", audio_file_path)
153
+ #
154
+ # try:
155
+ # _, file_ending = os.path.splitext(audio_file_path)
156
+ # out_file = audio_file_path.replace(file_ending, ".diarization.json")
157
+ # prettified_out_file = audio_file_path.replace(file_ending, ".diarization_pretty.json")
158
+ # if os.path.exists(out_file):
159
+ # logging.info("audio-diarization: Diarization file already exists: %s", out_file)
160
+ # with open(out_file) as f:
161
+ # global diarization_result
162
+ # diarization_result = json.load(f)
163
+ # return diarization_result
164
+ #
165
+ # logging.info('audio-diarization: Starting diarization...')
166
+ # diarization_result = pipeline(audio_file_path)
167
+ #
168
+ # segments = []
169
+ # for turn, _, speaker in diarization_result.itertracks(yield_label=True):
170
+ # chunk = {
171
+ # "Time_Start": turn.start,
172
+ # "Time_End": turn.end,
173
+ # "Speaker": speaker
174
+ # }
175
+ # logging.debug("Segment: %s", chunk)
176
+ # segments.append(chunk)
177
+ # logging.info("audio-diarization: Diarization completed with pyannote")
178
+ #
179
+ # output_data = {'segments': segments}
180
+ #
181
+ # logging.info("audio-diarization: Saving prettified JSON to %s", prettified_out_file)
182
+ # with open(prettified_out_file, 'w') as f:
183
+ # json.dump(output_data, f, indent=2)
184
+ #
185
+ # logging.info("audio-diarization: Saving JSON to %s", out_file)
186
+ # with open(out_file, 'w') as f:
187
+ # json.dump(output_data, f)
188
+ #
189
+ # except Exception as e:
190
+ # logging.error("audio-diarization: Error performing diarization: %s", str(e))
191
+ # raise RuntimeError("audio-diarization: Error performing diarization")
192
+ # return segments
193
+
194
def _match_speaker(transcription_segment, trans_start, trans_end, diarization_result):
    """Pick the speaker label for one transcription span.

    The first diarization segment that fully contains the span wins.
    Otherwise the segment with the largest positive time overlap is used,
    and ``'Unknown'`` is returned when nothing overlaps.
    """
    best_speaker = 'Unknown'
    best_overlap = 0
    for diarization_segment in diarization_result:
        if not isinstance(diarization_segment, dict):
            logging.warning(f"Unexpected diarization segment format: {diarization_segment}")
            continue

        try:
            diar_start = diarization_segment.get('start', 0)
            diar_end = diarization_segment.get('end', 0)
            speaker = diarization_segment.get('speaker', 'Unknown')

            if trans_start >= diar_start and trans_end <= diar_end:
                return speaker

            overlap = min(trans_end, diar_end) - max(trans_start, diar_start)
            if overlap > best_overlap:
                best_overlap = overlap
                best_speaker = speaker
        except Exception as e:
            logging.error(f"Error processing segment: {str(e)}")
            logging.error(f"Transcription segment: {transcription_segment}")
            logging.error(f"Diarization segment: {diarization_segment}")
            continue
    return best_speaker


def combine_transcription_and_diarization(audio_file_path: str) -> List[Dict[str, Any]]:
    """Transcribe and diarize an audio file, then label each segment with a speaker.

    Every well-formed transcription segment appears in the result. Segments
    that straddle a speaker change are attributed to the diarization segment
    they overlap most, and segments with no overlap get the speaker
    ``'Unknown'``; previously both kinds were dropped, losing their text.

    Args:
        audio_file_path: Path of the audio file to process.

    Returns:
        A list of dicts with ``Time_Start``, ``Time_End``, ``Speaker`` and
        ``Text`` keys, or an empty list when either step yields nothing, a
        result has an unexpected type, or any error occurs (errors are
        logged, not raised).
    """
    logging.info('combine-transcription-and-diarization: Starting transcription and diarization...')

    try:
        logging.info('Performing speech-to-text...')
        transcription_result = speech_to_text(audio_file_path)
        logging.info(f"Transcription result type: {type(transcription_result)}")
        logging.info(f"Transcription result: {transcription_result[:3] if isinstance(transcription_result, list) else transcription_result}")

        # Checked before diarizing so no diarization work is done for an
        # empty transcript.
        if not transcription_result:
            logging.error("Empty result from transcription")
            return []

        logging.info('Performing audio diarization...')
        diarization_result = audio_diarization(audio_file_path)
        logging.info(f"Diarization result type: {type(diarization_result)}")
        logging.info(f"Diarization result sample: {diarization_result[:3] if isinstance(diarization_result, list) else diarization_result}")

        if not diarization_result:
            logging.error("Empty result from diarization")
            return []

        # Handle the case where transcription_result is a dict with a 'segments' key
        if isinstance(transcription_result, dict) and 'segments' in transcription_result:
            transcription_segments = transcription_result['segments']
        elif isinstance(transcription_result, list):
            transcription_segments = transcription_result
        else:
            logging.error(f"Unexpected transcription result format: {type(transcription_result)}")
            return []

        logging.info(f"Number of transcription segments: {len(transcription_segments)}")
        logging.info(f"Transcription segments sample: {transcription_segments[:3]}")

        if not isinstance(diarization_result, list):
            logging.error(f"Unexpected diarization result format: {type(diarization_result)}")
            return []

        combined_result = []
        for transcription_segment in transcription_segments:
            if not isinstance(transcription_segment, dict):
                logging.warning(f"Unexpected transcription segment format: {transcription_segment}")
                continue

            trans_start = transcription_segment.get('Time_Start', 0)
            trans_end = transcription_segment.get('Time_End', 0)
            combined_result.append({
                "Time_Start": trans_start,
                "Time_End": trans_end,
                "Speaker": _match_speaker(transcription_segment, trans_start, trans_end, diarization_result),
                "Text": transcription_segment.get('Text', '')
            })

        logging.info(f"Combined result length: {len(combined_result)}")
        logging.info(f"Combined result sample: {combined_result[:3]}")
        return combined_result

    except Exception as e:
        logging.error(f"Error in combine_transcription_and_diarization: {str(e)}", exc_info=True)
        return []
271
+
272
+
273
+ #
274
+ #
275
+ #######################################################################################################################
App_Function_Libraries/Audio/__init__.py ADDED
File without changes
App_Function_Libraries/Chat.py CHANGED
@@ -1,6 +1,6 @@
1
  # Chat.py
2
  # Chat functions for interacting with the LLMs as chatbots
3
-
4
  # Imports
5
  import json
6
  import logging
@@ -15,9 +15,9 @@ from pathlib import Path
15
  # Local Imports
16
  from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database
17
  from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
18
- chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface#, chat_with_vllm
19
  from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
20
- chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi
21
  from App_Function_Libraries.DB.SQLite_DB import load_media_content
22
  from App_Function_Libraries.Utils.Utils import generate_unique_filename
23
  #
@@ -54,8 +54,8 @@ def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_messag
54
  response = chat_with_oobabooga(input_data, api_key, prompt, temp, system_message)
55
  elif api_endpoint.lower() == "tabbyapi":
56
  response = chat_with_tabbyapi(input_data, prompt, temp, system_message)
57
- #elif api_endpoint.lower() == "vllm":
58
- # response = chat_with_vllm(input_data, prompt, system_message)
59
  elif api_endpoint.lower() == "local-llm":
60
  response = chat_with_local_llm(input_data, prompt, temp, system_message)
61
  elif api_endpoint.lower() == "huggingface":
@@ -64,6 +64,8 @@ def chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_messag
64
  response = chat_with_ollama(input_data, prompt, temp, system_message)
65
  elif api_endpoint.lower() == "aphrodite":
66
  response = chat_with_aphrodite(input_data, prompt, temp, system_message)
 
 
67
  else:
68
  raise ValueError(f"Unsupported API endpoint: {api_endpoint}")
69
 
@@ -114,6 +116,8 @@ def chat(message, history, media_content, selected_parts, api_endpoint, api_key,
114
 
115
  # Use the existing API request code based on the selected endpoint
116
  response = chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_message)
 
 
117
  except Exception as e:
118
  logging.error(f"Error in chat function: {str(e)}")
119
  return f"An error occurred: {str(e)}"
@@ -279,26 +283,60 @@ def update_chat_content(selected_item, use_content, use_summary, use_prompt, ite
279
  print(f"Debug - Update Chat Content - No item selected or item not in mapping")
280
  return {}, []
281
 
 
 
 
 
 
 
 
 
282
 
283
  CHARACTERS_FILE = Path('.', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
284
 
 
285
  def save_character(character_data):
286
- if CHARACTERS_FILE.exists():
287
- with CHARACTERS_FILE.open('r') as f:
288
- characters = json.load(f)
289
- else:
290
- characters = {}
291
 
292
- characters[character_data['name']] = character_data
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
293
 
294
- with CHARACTERS_FILE.open('w') as f:
295
- json.dump(characters, f, indent=2)
296
 
297
 
298
  def load_characters():
299
- if os.path.exists(CHARACTERS_FILE):
300
- with open(CHARACTERS_FILE, 'r') as f:
301
- return json.load(f)
 
 
 
 
302
  return {}
303
 
304
 
 
1
  # Chat.py
2
  # Chat functions for interacting with the LLMs as chatbots
3
+ import base64
4
  # Imports
5
  import json
6
  import logging
 
15
  # Local Imports
16
  from App_Function_Libraries.DB.DB_Manager import get_conversation_name, save_chat_history_to_database
17
  from App_Function_Libraries.LLM_API_Calls import chat_with_openai, chat_with_anthropic, chat_with_cohere, \
18
+ chat_with_groq, chat_with_openrouter, chat_with_deepseek, chat_with_mistral, chat_with_huggingface #, chat_with_vllm
19
  from App_Function_Libraries.LLM_API_Calls_Local import chat_with_aphrodite, chat_with_local_llm, chat_with_ollama, \
20
+ chat_with_kobold, chat_with_llama, chat_with_oobabooga, chat_with_tabbyapi, chat_with_vllm, chat_with_custom_openai
21
  from App_Function_Libraries.DB.SQLite_DB import load_media_content
22
  from App_Function_Libraries.Utils.Utils import generate_unique_filename
23
  #
 
54
  response = chat_with_oobabooga(input_data, api_key, prompt, temp, system_message)
55
  elif api_endpoint.lower() == "tabbyapi":
56
  response = chat_with_tabbyapi(input_data, prompt, temp, system_message)
57
+ elif api_endpoint.lower() == "vllm":
58
+ response = chat_with_vllm(input_data, prompt, system_message)
59
  elif api_endpoint.lower() == "local-llm":
60
  response = chat_with_local_llm(input_data, prompt, temp, system_message)
61
  elif api_endpoint.lower() == "huggingface":
 
64
  response = chat_with_ollama(input_data, prompt, temp, system_message)
65
  elif api_endpoint.lower() == "aphrodite":
66
  response = chat_with_aphrodite(input_data, prompt, temp, system_message)
67
+ elif api_endpoint.lower() == "custom-openai-api":
68
+ response = chat_with_custom_openai(api_key, input_data, prompt, temp, system_message)
69
  else:
70
  raise ValueError(f"Unsupported API endpoint: {api_endpoint}")
71
 
 
116
 
117
  # Use the existing API request code based on the selected endpoint
118
  response = chat_api_call(api_endpoint, api_key, input_data, prompt, temp, system_message)
119
+
120
+ return response
121
  except Exception as e:
122
  logging.error(f"Error in chat function: {str(e)}")
123
  return f"An error occurred: {str(e)}"
 
283
  print(f"Debug - Update Chat Content - No item selected or item not in mapping")
284
  return {}, []
285
 
286
+ #
287
+ # End of Chat functions
288
+ ##########################################################################################################################
289
+
290
+
291
+ ##########################################################################################################################
292
+ #
293
+ # Character Card Functions
294
 
295
  CHARACTERS_FILE = Path('.', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
296
 
297
+
298
  def save_character(character_data):
299
+ characters_file = os.path.join(os.path.dirname(__file__), '..', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
300
+ characters_dir = os.path.dirname(characters_file)
 
 
 
301
 
302
+ try:
303
+ if os.path.exists(characters_file):
304
+ with open(characters_file, 'r') as f:
305
+ characters = json.load(f)
306
+ else:
307
+ characters = {}
308
+
309
+ char_name = character_data['name']
310
+
311
+ # Save the image separately if it exists
312
+ if 'image' in character_data:
313
+ img_data = base64.b64decode(character_data['image'])
314
+ img_filename = f"{char_name.replace(' ', '_')}.png"
315
+ img_path = os.path.join(characters_dir, img_filename)
316
+ with open(img_path, 'wb') as f:
317
+ f.write(img_data)
318
+ character_data['image_path'] = os.path.abspath(img_path)
319
+ del character_data['image'] # Remove the base64 image data from the JSON
320
+
321
+ characters[char_name] = character_data
322
+
323
+ with open(characters_file, 'w') as f:
324
+ json.dump(characters, f, indent=2)
325
+
326
+ logging.info(f"Character '{char_name}' saved successfully.")
327
+ except Exception as e:
328
+ logging.error(f"Error saving character: {str(e)}")
329
 
 
 
330
 
331
 
332
  def load_characters():
333
+ characters_file = os.path.join(os.path.dirname(__file__), '..', 'Helper_Scripts', 'Character_Cards', 'Characters.json')
334
+ if os.path.exists(characters_file):
335
+ with open(characters_file, 'r') as f:
336
+ characters = json.load(f)
337
+ logging.debug(f"Loaded {len(characters)} characters from {characters_file}")
338
+ return characters
339
+ logging.warning(f"Characters file not found: {characters_file}")
340
  return {}
341
 
342
 
App_Function_Libraries/Chunk_Lib.py CHANGED
@@ -7,6 +7,7 @@
7
  ####
8
  # Import necessary libraries
9
  import hashlib
 
10
  import logging
11
  import re
12
  from typing import Any, Dict, List, Optional, Tuple
@@ -72,42 +73,53 @@ def load_document(file_path):
72
 
73
  def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
74
  logging.debug("Improved chunking process started...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
  options = chunk_options.copy()
76
  if custom_chunk_options:
77
  options.update(custom_chunk_options)
78
 
79
  chunk_method = options.get('method', 'words')
80
- base_size = options.get('base_size', 1000)
81
- min_size = options.get('min_size', 100)
82
  max_size = options.get('max_size', 2000)
83
  overlap = options.get('overlap', 0)
84
  language = options.get('language', None)
85
- adaptive = options.get('adaptive', False)
86
- multi_level = options.get('multi_level', False)
87
 
88
  if language is None:
89
  language = detect_language(text)
90
 
91
- if adaptive:
92
- max_chunk_size = adaptive_chunk_size(text, base_size, min_size, max_size)
93
- else:
94
- max_chunk_size = base_size
95
-
96
- if multi_level:
97
- chunks = multi_level_chunking(text, chunk_method, max_chunk_size, overlap, language)
98
- else:
99
- chunks = chunk_text(text, chunk_method, max_chunk_size, overlap, language)
100
 
101
  chunks_with_metadata = []
 
102
  for i, chunk in enumerate(chunks):
103
- metadata = get_chunk_metadata(
104
- chunk,
105
- text,
106
- chunk_type=chunk_method,
107
- language=language
108
- )
109
- metadata['chunk_index'] = i
110
- metadata['total_chunks'] = len(chunks)
 
 
 
111
 
112
  chunks_with_metadata.append({
113
  'text': chunk,
@@ -117,6 +129,7 @@ def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] =
117
  return chunks_with_metadata
118
 
119
 
 
120
  def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
121
  logging.debug("Multi-level chunking process started...")
122
  # First level: chunk by paragraphs
 
7
  ####
8
  # Import necessary libraries
9
  import hashlib
10
+ import json
11
  import logging
12
  import re
13
  from typing import Any, Dict, List, Optional, Tuple
 
73
 
74
  def improved_chunking_process(text: str, custom_chunk_options: Dict[str, Any] = None) -> List[Dict[str, Any]]:
75
  logging.debug("Improved chunking process started...")
76
+
77
+ # Extract JSON metadata if present
78
+ json_content = {}
79
+ try:
80
+ json_end = text.index("}\n") + 1
81
+ json_content = json.loads(text[:json_end])
82
+ text = text[json_end:].strip()
83
+ logging.debug(f"Extracted JSON metadata: {json_content}")
84
+ except (ValueError, json.JSONDecodeError):
85
+ logging.debug("No JSON metadata found at the beginning of the text")
86
+
87
+ # Extract any additional header text
88
+ header_match = re.match(r"(This text was transcribed using.*?)\n\n", text, re.DOTALL)
89
+ header_text = ""
90
+ if header_match:
91
+ header_text = header_match.group(1)
92
+ text = text[len(header_text):].strip()
93
+ logging.debug(f"Extracted header text: {header_text}")
94
+
95
  options = chunk_options.copy()
96
  if custom_chunk_options:
97
  options.update(custom_chunk_options)
98
 
99
  chunk_method = options.get('method', 'words')
 
 
100
  max_size = options.get('max_size', 2000)
101
  overlap = options.get('overlap', 0)
102
  language = options.get('language', None)
 
 
103
 
104
  if language is None:
105
  language = detect_language(text)
106
 
107
+ chunks = chunk_text(text, chunk_method, max_size, overlap, language)
 
 
 
 
 
 
 
 
108
 
109
  chunks_with_metadata = []
110
+ total_chunks = len(chunks)
111
  for i, chunk in enumerate(chunks):
112
+ metadata = {
113
+ 'chunk_index': i,
114
+ 'total_chunks': total_chunks,
115
+ 'chunk_method': chunk_method,
116
+ 'max_size': max_size,
117
+ 'overlap': overlap,
118
+ 'language': language,
119
+ 'relative_position': i / total_chunks
120
+ }
121
+ metadata.update(json_content) # Add the extracted JSON content to metadata
122
+ metadata['header_text'] = header_text # Add the header text to metadata
123
 
124
  chunks_with_metadata.append({
125
  'text': chunk,
 
129
  return chunks_with_metadata
130
 
131
 
132
+
133
  def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
134
  logging.debug("Multi-level chunking process started...")
135
  # First level: chunk by paragraphs
App_Function_Libraries/Gradio_Related.py CHANGED
@@ -16,6 +16,8 @@ import gradio as gr
16
  # Local Imports
17
  from App_Function_Libraries.DB.DB_Manager import get_db_config
18
  from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
 
 
19
  from App_Function_Libraries.Gradio_UI.Chat_ui import create_chat_management_tab, \
20
  create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface
21
  from App_Function_Libraries.Gradio_UI.Config_tab import create_config_editor_tab
@@ -39,8 +41,9 @@ from App_Function_Libraries.Gradio_UI.RAG_QA_Chat_tab import create_rag_qa_chat_
39
  from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
40
  from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_view_tab, create_prompt_search_tab, \
41
  create_search_summaries_tab, create_viewing_tab, create_search_tab
42
- from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_embeddings_tab, create_rag_tab, \
43
- create_view_embeddings_tab
 
44
  from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
45
  create_delete_trash_tab, create_search_and_mark_trash_tab
46
  from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
@@ -260,11 +263,9 @@ def launch_ui(share_public=None, server_mode=False):
260
  create_search_tab()
261
  create_search_summaries_tab()
262
 
263
- with gr.TabItem("RAG Search / Embeddings"):
264
  create_rag_tab()
265
  create_rag_qa_chat_tab()
266
- create_embeddings_tab()
267
- create_view_embeddings_tab()
268
 
269
  with gr.TabItem("Chat with an LLM"):
270
  create_chat_interface()
@@ -274,9 +275,12 @@ def launch_ui(share_public=None, server_mode=False):
274
  create_chat_with_llamafile_tab()
275
  create_chat_management_tab()
276
  chat_workflows_tab()
277
- from App_Function_Libraries.Gradio_UI.Writing_tab import create_character_card_interaction_tab
 
278
  create_character_card_interaction_tab()
279
 
 
 
280
  with gr.TabItem("View DB Items"):
281
  create_viewing_tab()
282
  create_prompt_view_tab()
@@ -295,6 +299,11 @@ def launch_ui(share_public=None, server_mode=False):
295
  # FIXME
296
  #create_compare_transcripts_tab()
297
 
 
 
 
 
 
298
  with gr.TabItem("Writing Tools"):
299
  with gr.Tabs():
300
  from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
 
16
  # Local Imports
17
  from App_Function_Libraries.DB.DB_Manager import get_db_config
18
  from App_Function_Libraries.Gradio_UI.Audio_ingestion_tab import create_audio_processing_tab
19
+ from App_Function_Libraries.Gradio_UI.Character_Interaction_tab import create_character_card_interaction_tab, \
20
+ create_multiple_character_chat_tab, create_narrator_controlled_conversation_tab
21
  from App_Function_Libraries.Gradio_UI.Chat_ui import create_chat_management_tab, \
22
  create_chat_interface_four, create_chat_interface_multi_api, create_chat_interface_stacked, create_chat_interface
23
  from App_Function_Libraries.Gradio_UI.Config_tab import create_config_editor_tab
 
41
  from App_Function_Libraries.Gradio_UI.Re_summarize_tab import create_resummary_tab
42
  from App_Function_Libraries.Gradio_UI.Search_Tab import create_prompt_view_tab, create_prompt_search_tab, \
43
  create_search_summaries_tab, create_viewing_tab, create_search_tab
44
+ from App_Function_Libraries.Gradio_UI.RAG_Chat_tab import create_rag_tab
45
+ from App_Function_Libraries.Gradio_UI.Embeddings_tab import create_embeddings_tab, create_view_embeddings_tab, \
46
+ create_purge_embeddings_tab
47
  from App_Function_Libraries.Gradio_UI.Trash import create_view_trash_tab, create_empty_trash_tab, \
48
  create_delete_trash_tab, create_search_and_mark_trash_tab
49
  from App_Function_Libraries.Gradio_UI.Utilities import create_utilities_yt_timestamp_tab, create_utilities_yt_audio_tab, \
 
263
  create_search_tab()
264
  create_search_summaries_tab()
265
 
266
+ with gr.TabItem("RAG Search"):
267
  create_rag_tab()
268
  create_rag_qa_chat_tab()
 
 
269
 
270
  with gr.TabItem("Chat with an LLM"):
271
  create_chat_interface()
 
275
  create_chat_with_llamafile_tab()
276
  create_chat_management_tab()
277
  chat_workflows_tab()
278
+ create_multiple_character_chat_tab()
279
+ create_narrator_controlled_conversation_tab()
280
  create_character_card_interaction_tab()
281
 
282
+
283
+
284
  with gr.TabItem("View DB Items"):
285
  create_viewing_tab()
286
  create_prompt_view_tab()
 
299
  # FIXME
300
  #create_compare_transcripts_tab()
301
 
302
+ with gr.TabItem("Embeddings Management"):
303
+ create_embeddings_tab()
304
+ create_view_embeddings_tab()
305
+ create_purge_embeddings_tab()
306
+
307
  with gr.TabItem("Writing Tools"):
308
  with gr.Tabs():
309
  from App_Function_Libraries.Gradio_UI.Writing_tab import create_document_feedback_tab
App_Function_Libraries/Local_File_Processing_Lib.py CHANGED
@@ -20,10 +20,10 @@
20
 
21
  # Import necessary libraries
22
  # Import Local
23
- from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav
24
  from App_Function_Libraries.Video_DL_Ingestion_Lib import *
25
  from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
26
- from App_Function_Libraries.Utils import normalize_title, create_download_directory
27
 
28
  #######################################################################################################################
29
  # Function Definitions
 
20
 
21
  # Import necessary libraries
22
  # Import Local
23
+ from App_Function_Libraries.Audio.Audio_Transcription_Lib import convert_to_wav
24
  from App_Function_Libraries.Video_DL_Ingestion_Lib import *
25
  from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
26
+ from App_Function_Libraries.Utils.Utils import normalize_title, create_download_directory
27
 
28
  #######################################################################################################################
29
  # Function Definitions
App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py CHANGED
@@ -27,11 +27,11 @@ import subprocess
27
  import sys
28
  import time
29
 
30
- from App_Function_Libraries.Utils import download_file
31
  # Import 3rd-pary Libraries
32
  #
33
  # Import Local
34
- from Article_Summarization_Lib import *
35
 
36
  #
37
  #
 
27
  import sys
28
  import time
29
 
30
+ from App_Function_Libraries.Utils.Utils import download_file
31
  # Import 3rd-pary Libraries
32
  #
33
  # Import Local
34
+ from App_Function_Libraries.Web_Scraping.Article_Summarization_Lib import *
35
 
36
  #
37
  #
App_Function_Libraries/MediaWiki/Media_Wiki.py CHANGED
@@ -7,6 +7,7 @@ import json
7
  import logging
8
  import os
9
  import re
 
10
  from typing import List, Dict, Any, Iterator, Optional
11
  # 3rd-Party Imports
12
  import mwparserfromhell
@@ -14,12 +15,19 @@ import mwxml
14
  import yaml
15
  #
16
  # Local Imports
17
- from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords, check_media_exists
18
  from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
19
  #
20
  #######################################################################################################################
21
  #
22
  # Functions:
 
 
 
 
 
 
 
23
 
24
  def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
25
  """Set up and return a logger with the given name and level."""
@@ -41,11 +49,11 @@ def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] =
41
  # Usage
42
  logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')
43
 
44
- # Load configuration
45
- def load_mediawiki_import_config():
46
- with open(os.path.join('Config_Files', 'mediawiki_import_config.yaml'), 'r') as f:
47
- return yaml.safe_load(f)
48
- config = load_mediawiki_import_config()
49
 
50
  def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> Iterator[
51
  Dict[str, Any]]:
@@ -57,11 +65,11 @@ def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redi
57
  continue
58
 
59
  for revision in page:
60
- code = mwparserfromhell.parse(revision.text)
61
- text = code.strip_code(normalize=True, collapse=True, keep_template_params=False)
62
  yield {
63
  "title": page.title,
64
- "content": text,
65
  "namespace": page.namespace,
66
  "page_id": page.id,
67
  "revision_id": revision.id,
@@ -76,6 +84,7 @@ def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[st
76
  current_chunk = ""
77
  current_size = 0
78
 
 
79
  for i in range(0, len(sections), 2):
80
  section_title = sections[i] if i > 0 else "Introduction"
81
  section_content = sections[i + 1] if i + 1 < len(sections) else ""
@@ -95,33 +104,54 @@ def optimized_chunking(text: str, chunk_options: Dict[str, Any]) -> List[Dict[st
95
  return chunks
96
 
97
 
 
 
 
98
  def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
99
- is_combined: bool = False, item: Dict[str, Any] = None):
100
  try:
101
- url = f"mediawiki:{wiki_name}" if is_combined else f"mediawiki:{wiki_name}:{title}"
102
-
103
- if not check_media_exists(title, url):
104
- media_id = add_media_with_keywords(
105
- url=url,
106
- title=title,
107
- media_type="mediawiki_dump" if is_combined else "mediawiki_article",
108
- content=content,
109
- keywords=f"mediawiki,{wiki_name}" + (",full_dump" if is_combined else ",article"),
110
- prompt="",
111
- summary="",
112
- transcription_model="",
113
- author="MediaWiki",
114
- ingestion_date=item['timestamp'].strftime('%Y-%m-%d') if item else None
115
- )
116
-
117
- chunks = optimized_chunking(content, chunk_options)
118
- for chunk in chunks:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  process_and_store_content(chunk['text'], f"mediawiki_{wiki_name}", media_id, title)
120
- logger.info(f"Successfully processed item: {title}")
121
- else:
122
- logger.info(f"Skipping existing article: {title}")
123
  except Exception as e:
124
- logger.error(f"Error processing item {title}: {str(e)}")
 
125
 
126
 
127
  def load_checkpoint(file_path: str) -> int:
@@ -143,9 +173,12 @@ def import_mediawiki_dump(
143
  skip_redirects: bool = False,
144
  chunk_options: Dict[str, Any] = None,
145
  single_item: bool = False,
146
- progress_callback: Any = None
 
 
147
  ) -> Iterator[str]:
148
  try:
 
149
  if chunk_options is None:
150
  chunk_options = config['chunking']
151
 
@@ -160,6 +193,10 @@ def import_mediawiki_dump(
160
  for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
161
  if item['page_id'] <= last_processed_id:
162
  continue
 
 
 
 
163
  process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
164
  save_checkpoint(checkpoint_file, item['page_id'])
165
  processed_pages += 1
 
7
  import logging
8
  import os
9
  import re
10
+ import traceback
11
  from typing import List, Dict, Any, Iterator, Optional
12
  # 3rd-Party Imports
13
  import mwparserfromhell
 
15
  import yaml
16
  #
17
  # Local Imports
18
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
19
  from App_Function_Libraries.RAG.ChromaDB_Library import process_and_store_content
20
  #
21
  #######################################################################################################################
22
  #
23
  # Functions:
24
+ # Load configuration
25
def load_mediawiki_import_config():
    """Load the MediaWiki import settings from the YAML config file.

    Reads ``Config_Files/mediawiki_import_config.yaml`` (path relative to the
    current working directory) and parses it with ``yaml.safe_load``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file's contents.

    Raises:
        OSError: If the config file cannot be opened.
        yaml.YAMLError: If the file is not valid YAML.
    """
    config_path = os.path.join('Config_Files', 'mediawiki_import_config.yaml')
    # Pin the encoding: without it the file is decoded with the platform's
    # locale encoding, which mangles non-ASCII values on e.g. Windows.
    with open(config_path, 'r', encoding='utf-8') as f:
        return yaml.safe_load(f)
28
+
29
+ config = load_mediawiki_import_config()
30
+
31
 
32
  def setup_logger(name: str, level: int = logging.INFO, log_file: Optional[str] = None) -> logging.Logger:
33
  """Set up and return a logger with the given name and level."""
 
49
  # Usage
50
  logger = setup_logger('mediawiki_import', log_file='mediawiki_import.log')
51
 
52
+ # End of setup
53
+ #######################################################################################################################
54
+ #
55
+ # Functions:
56
+
57
 
58
  def parse_mediawiki_dump(file_path: str, namespaces: List[int] = None, skip_redirects: bool = False) -> Iterator[
59
  Dict[str, Any]]:
 
65
  continue
66
 
67
  for revision in page:
68
+ wikicode = mwparserfromhell.parse(revision.text)
69
+ plain_text = wikicode.strip_code()
70
  yield {
71
  "title": page.title,
72
+ "content": plain_text,
73
  "namespace": page.namespace,
74
  "page_id": page.id,
75
  "revision_id": revision.id,
 
84
  current_chunk = ""
85
  current_size = 0
86
 
87
+ logging.debug(f"optimized_chunking: Processing text with {len(sections) // 2} sections")
88
  for i in range(0, len(sections), 2):
89
  section_title = sections[i] if i > 0 else "Introduction"
90
  section_content = sections[i + 1] if i + 1 < len(sections) else ""
 
104
  return chunks
105
 
106
 
107
+
108
+
109
+
110
def process_single_item(content: str, title: str, wiki_name: str, chunk_options: Dict[str, Any],
                        is_combined: bool = False, item: Dict[str, Any] = None, api_name: str = None):
    """Store one MediaWiki page in the media DB and hand its chunks to the vector store.

    Args:
        content: Text of the page (or combined dump).
        title: Page title; also used to build the item's URL.
        wiki_name: Name of the wiki; used in the URL, keywords and collection name.
        chunk_options: Options forwarded to ``optimized_chunking``.
        is_combined: When true the item is recorded as a full dump rather than
            a single article (affects media type and keywords only).
        item: Optional page record; when given, its ``'timestamp'`` entry is
            formatted as the ingestion date.
        api_name: When truthy, each chunk is stored with the two extra flags set
            to ``True`` and this API name passed along.

    Returns:
        None. Any exception is logged (with traceback) and swallowed.
    """
    try:
        logging.debug(f"process_single_item: Processing item: {title}")

        # The URL doubles as the item's unique key: wiki name + underscored title.
        underscored_title = title.replace(" ", "_")
        url = f"mediawiki:{wiki_name}:{underscored_title}"
        logging.debug(f"Generated URL: {url}")

        if is_combined:
            media_type, kind_keyword = "mediawiki_dump", ",full_dump"
        else:
            media_type, kind_keyword = "mediawiki_article", ",article"
        ingestion_date = item['timestamp'].strftime('%Y-%m-%d') if item else None

        result = add_media_with_keywords(
            url=url,
            title=title,
            media_type=media_type,
            content=content,
            keywords=f"mediawiki,{wiki_name}" + kind_keyword,
            prompt="",
            summary="",
            transcription_model="",
            author="MediaWiki",
            ingestion_date=ingestion_date
        )
        logging.debug(f"Result from add_media_with_keywords: {result}")

        # The DB helper hands back an (id, status message) pair.
        media_id, message = result
        logging.info(f"Media item result: {message}")
        logging.debug(f"Final media_id: {media_id}")

        collection_name = f"mediawiki_{wiki_name}"
        # With an API name, the three extra positional arguments are appended;
        # otherwise the store call relies on its own defaults.
        extra_args = (True, True, api_name) if api_name else ()

        pieces = optimized_chunking(content, chunk_options)
        for position, piece in enumerate(pieces, start=1):
            logging.debug(f"Processing chunk {position}/{len(pieces)} for item: {title}")
            process_and_store_content(piece['text'], collection_name, media_id, title, *extra_args)
        logging.info(f"Successfully processed item: {title}")
    except Exception as e:
        # Best-effort import: one bad page must not abort the whole dump.
        logging.error(f"Error processing item {title}: {str(e)}")
        logging.error(f"Exception details: {traceback.format_exc()}")
155
 
156
 
157
  def load_checkpoint(file_path: str) -> int:
 
173
  skip_redirects: bool = False,
174
  chunk_options: Dict[str, Any] = None,
175
  single_item: bool = False,
176
+ progress_callback: Any = None,
177
+ api_name: str = None,
178
+ api_key: str = None
179
  ) -> Iterator[str]:
180
  try:
181
+ logging.info(f"Importing MediaWiki dump: {file_path}")
182
  if chunk_options is None:
183
  chunk_options = config['chunking']
184
 
 
193
  for item in parse_mediawiki_dump(file_path, namespaces, skip_redirects):
194
  if item['page_id'] <= last_processed_id:
195
  continue
196
+ # FIXME - ensure this works...
197
+ if api_name is not None:
198
+ # FIXME - add API key to the call/params
199
+ process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item, api_name)
200
  process_single_item(item['content'], item['title'], wiki_name, chunk_options, False, item)
201
  save_checkpoint(checkpoint_file, item['page_id'])
202
  processed_pages += 1
App_Function_Libraries/PDF/PDF_Ingestion_Lib.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF_Ingestion_Lib.py
2
+ #########################################
3
+ # Library to hold functions for ingesting PDF files.
4
+ #
5
+ ####################
6
+ # Function List
7
+ #
8
+ # 1. convert_pdf_to_markdown(pdf_path)
9
+ # 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
10
+ # 3.
11
+ #
12
+ #
13
+ ####################
14
+ import re
15
+
16
+ # Import necessary libraries
17
+
18
+
19
+ # Import Local
20
+
21
+ #######################################################################################################################
22
+ # Function Definitions
23
+ #
24
+
25
+ # Ingest a text file into the database with Title/Author/Keywords
26
+
27
+
28
+ # Constants
29
+ MAX_FILE_SIZE_MB = 50
30
+ CONVERSION_TIMEOUT_SECONDS = 300
31
+
32
+ # Marker PDF solution
33
+ # def convert_pdf_to_markdown(pdf_path):
34
+ # """
35
+ # Convert a PDF file to Markdown by calling a script in another virtual environment.
36
+ # """
37
+ #
38
+ # logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
39
+ # # Check if the file size exceeds the maximum allowed size
40
+ # file_size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
41
+ # if file_size_mb > MAX_FILE_SIZE_MB:
42
+ # raise ValueError(f"File size ({file_size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")
43
+ #
44
+ # logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
45
+ # # Path to the Python interpreter in the other virtual environment
46
+ # other_venv_python = "Helper_Scripts/marker_venv/bin/python"
47
+ #
48
+ # # Path to the conversion script
49
+ # converter_script = "Helper_Scripts/PDF_Converter.py"
50
+ #
51
+ # logging.debug("Marker: Attempting to convert PDF file to Markdown...")
52
+ # try:
53
+ # result = subprocess.run(
54
+ # [other_venv_python, converter_script, pdf_path],
55
+ # capture_output=True,
56
+ # text=True,
57
+ # timeout=CONVERSION_TIMEOUT_SECONDS
58
+ # )
59
+ # if result.returncode != 0:
60
+ # raise Exception(f"Conversion failed: {result.stderr}")
61
+ # return result.stdout
62
+ # except subprocess.TimeoutExpired:
63
+ # raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")
64
+ #
65
+ #
66
+ # def process_and_ingest_pdf(file, title, author, keywords):
67
+ # if file is None:
68
+ # return "Please select a PDF file to upload."
69
+ #
70
+ # try:
71
+ # # Create a temporary directory
72
+ # with tempfile.TemporaryDirectory() as temp_dir:
73
+ # # Create a path for the temporary PDF file
74
+ # temp_path = os.path.join(temp_dir, "temp.pdf")
75
+ #
76
+ # # Copy the contents of the uploaded file to the temporary file
77
+ # shutil.copy(file.name, temp_path)
78
+ #
79
+ # # Call the ingest_pdf_file function with the temporary file path
80
+ # result = ingest_pdf_file(temp_path, title, author, keywords)
81
+ #
82
+ # return result
83
+ # except Exception as e:
84
+ # return f"Error processing PDF: {str(e)}"
85
+ #
86
+ #
87
+ # def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
88
+ # try:
89
+ # # Convert PDF to Markdown
90
+ # markdown_content = convert_pdf_to_markdown(file_path)
91
+ #
92
+ # # If title is not provided, use the filename without extension
93
+ # if not title:
94
+ # title = os.path.splitext(os.path.basename(file_path))[0]
95
+ #
96
+ # # If author is not provided, set it to 'Unknown'
97
+ # if not author:
98
+ # author = 'Unknown'
99
+ #
100
+ # # If keywords are not provided, use a default keyword
101
+ # if not keywords:
102
+ # keywords = 'pdf_file,markdown_converted'
103
+ # else:
104
+ # keywords = f'pdf_file,markdown_converted,{keywords}'
105
+ #
106
+ # # Add the markdown content to the database
107
+ # add_media_with_keywords(
108
+ # url=file_path,
109
+ # title=title,
110
+ # media_type='document',
111
+ # content=markdown_content,
112
+ # keywords=keywords,
113
+ # prompt='No prompt for PDF files',
114
+ # summary='No summary for PDF files',
115
+ # transcription_model='None',
116
+ # author=author,
117
+ # ingestion_date=datetime.now().strftime('%Y-%m-%d')
118
+ # )
119
+ #
120
+ # return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
121
+ # except ValueError as e:
122
+ # logging.error(f"File size error: {str(e)}")
123
+ # return f"Error: {str(e)}", file_path
124
+ # except Exception as e:
125
+ # logging.error(f"Error ingesting PDF file: {str(e)}")
126
+ # return f"Error ingesting PDF file: {str(e)}", file_path
127
+ #
128
+ #
129
+ # def process_and_cleanup_pdf(file, title, author, keywords):
130
+ # # FIXME - Update to validate file upload/filetype is pdf....
131
+ # if file is None:
132
+ # return "No file uploaded. Please upload a PDF file."
133
+ #
134
+ # temp_dir = tempfile.mkdtemp()
135
+ # temp_file_path = os.path.join(temp_dir, "temp.pdf")
136
+ #
137
+ # try:
138
+ # # Copy the uploaded file to a temporary location
139
+ # shutil.copy2(file.name, temp_file_path)
140
+ #
141
+ # # Process the file
142
+ # result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
143
+ #
144
+ # return result
145
+ # except Exception as e:
146
+ # logging.error(f"Error in processing and cleanup: {str(e)}")
147
+ # return f"Error: {str(e)}"
148
+ # finally:
149
+ # # Clean up the temporary directory and its contents
150
+ # try:
151
+ # shutil.rmtree(temp_dir)
152
+ # logging.info(f"Removed temporary directory: {temp_dir}")
153
+ # except Exception as cleanup_error:
154
+ # logging.error(f"Error during cleanup: {str(cleanup_error)}")
155
+ # result += f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"
156
+
157
+
158
+ import logging
159
+ #
160
+ #
161
+ #######################################################################################################################
162
+ #
163
+ # Non-Marker implementation
164
+ import os
165
+ import shutil
166
+ import tempfile
167
+ from datetime import datetime
168
+
169
+ import pymupdf
170
+
171
+ from App_Function_Libraries.DB.DB_Manager import add_media_with_keywords
172
+
173
+
174
# PyMuPDF span "flags" bits (see the TextPage span dictionary documentation):
# bit 0 = superscript, bit 1 = italic, bit 2 = serifed, bit 3 = monospaced,
# bit 4 = bold.
_SPAN_FLAG_ITALIC = 1 << 1
_SPAN_FLAG_BOLD = 1 << 4


def _format_span(text, font_size, font_flags):
    """Return *text* decorated with Markdown heading/emphasis markers."""
    # Whitespace-only spans carry no content; decorating them would emit
    # stray markers such as "** **".
    if not text.strip():
        return text

    if font_flags & _SPAN_FLAG_BOLD:
        text = f"**{text}**"
    if font_flags & _SPAN_FLAG_ITALIC:
        text = f"*{text}*"

    # Heading marker goes outside the emphasis so it stays at the front of
    # the text ("# **Title**" rather than "**# Title**").
    if font_size > 20:
        text = f"# {text}"
    elif font_size > 16:
        text = f"## {text}"
    elif font_size > 14:
        text = f"### {text}"
    return text


def extract_text_and_format_from_pdf(pdf_path):
    """
    Extract text from a PDF file and convert it to Markdown, preserving formatting.

    Each page starts with a "## Page N" heading and ends with a "---" rule.
    Text blocks become paragraphs; spans are marked up from their font size
    (heading levels) and font flags (bold / italic). Image blocks are
    represented by an "[Image]" placeholder. A line ending in a hyphen is
    joined directly to the following line.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The Markdown text for the whole document.

    Raises:
        Exception: Any error raised while opening or reading the document is
            logged and re-raised unchanged.
    """
    try:
        parts = []
        with pymupdf.open(pdf_path) as doc:
            for page_num, page in enumerate(doc, 1):
                parts.append(f"## Page {page_num}\n\n")
                for block in page.get_text("dict")["blocks"]:
                    if block["type"] == 0:  # Text block
                        paragraph = ""
                        for line in block["lines"]:
                            line_text = " ".join(
                                _format_span(span["text"], span["size"], span["flags"])
                                for span in line["spans"]
                            ).rstrip()

                            # A trailing hyphen means the word continues on the
                            # next line: drop it and join without a space.
                            if line_text.endswith('-'):
                                line_text = line_text[:-1]
                            else:
                                line_text += " "
                            paragraph += line_text

                        if paragraph:
                            # Collapse runs of whitespace left by span joins.
                            paragraph = re.sub(r'\s+', ' ', paragraph).strip()
                            parts.append(paragraph + "\n\n")
                    elif block["type"] == 1:  # Image block
                        parts.append("[Image]\n\n")
                parts.append("\n---\n\n")  # Page separator

        markdown_text = "".join(parts)

        # Clean up hyphenated words that are still split across a newline
        markdown_text = re.sub(r'(\w+)-\s*\n(\w+)', r'\1\2', markdown_text)

        return markdown_text
    except Exception as e:
        logging.error(f"Error extracting text and formatting from PDF: {str(e)}")
        raise
235
+
236
+
237
def extract_metadata_from_pdf(pdf_path):
    """
    Extract metadata from a PDF file using PyMuPDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The document's metadata dictionary. An empty dict is returned when the
        document exposes no metadata or when it cannot be opened/read (the
        error is logged, not raised).
    """
    try:
        with pymupdf.open(pdf_path) as doc:
            # Document.metadata can be None (per the PyMuPDF docs, e.g. for an
            # encrypted document); normalise so callers can always use .get().
            return doc.metadata or {}
    except Exception as e:
        logging.error(f"Error extracting metadata from PDF: {str(e)}")
        return {}
248
+
249
+
250
def process_and_ingest_pdf(file, title, author, keywords):
    """Convert an uploaded PDF to Markdown and add it to the media database.

    Args:
        file: Uploaded file object exposing a ``name`` attribute (its path), or
            None when nothing was uploaded.
        title: Title to store; when empty, the PDF's metadata title is used,
            falling back to the file name without its extension.
        author: Author to store; when empty, the PDF's metadata author is used,
            falling back to 'Unknown'.
        keywords: Comma-separated keywords supplied by the user (may be empty).

    Returns:
        A human-readable status message. Failures are logged and reported in
        the returned message rather than raised.
    """
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # Work on a private copy so the upload itself is never touched.
        with tempfile.TemporaryDirectory() as temp_dir:
            temp_path = os.path.join(temp_dir, "temp.pdf")
            shutil.copy(file.name, temp_path)

            # Extract text and convert to Markdown
            markdown_text = extract_text_and_format_from_pdf(temp_path)

            # Extract metadata from PDF (tolerate a missing/None result)
            metadata = extract_metadata_from_pdf(temp_path) or {}

            # PyMuPDF always supplies the metadata keys, with empty or None
            # values when the PDF does not define them, so fall back on
            # falsy values rather than on missing keys.
            if not title:
                title = metadata.get('title') or os.path.splitext(os.path.basename(file.name))[0]
            if not author:
                author = metadata.get('author') or 'Unknown'

            # Always tag the item; append user keywords when provided.
            if not keywords:
                keywords = 'pdf_file,markdown_converted'
            else:
                keywords = f'pdf_file,markdown_converted,{keywords}'

            # Add the PDF's subject as a keyword only when it is non-empty,
            # otherwise a blank keyword (trailing comma) would be stored.
            subject = metadata.get('subject')
            if subject:
                keywords += f",{subject}"

            # Add the PDF content to the database
            add_media_with_keywords(
                url=file.name,
                title=title,
                media_type='document',
                content=markdown_text,
                keywords=keywords,
                prompt='No prompt for PDF files',
                summary='No summary for PDF files',
                transcription_model='None',
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )

        return f"PDF file '{title}' by {author} ingested successfully and converted to Markdown."
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}"
303
+
304
+
305
def process_and_cleanup_pdf(file, title, author, keywords):
    """Entry point for PDF uploads: ingest the file and report the outcome.

    Delegates to ``process_and_ingest_pdf`` and converts any exception that
    escapes it into an error message.

    Args:
        file: Uploaded file object, or None when nothing was uploaded.
        title: Title to store for the document (may be empty).
        author: Author to store for the document (may be empty).
        keywords: Comma-separated keywords (may be empty).

    Returns:
        A human-readable status message.
    """
    if file is not None:
        try:
            outcome = process_and_ingest_pdf(file, title, author, keywords)
        except Exception as err:
            logging.error(f"Error in processing and cleanup: {str(err)}")
            outcome = f"Error: {str(err)}"
        return outcome

    return "No file uploaded. Please upload a PDF file."
315
+
316
+ #
317
+ # End of PDF_Ingestion_Lib.py
318
+ #######################################################################################################################
App_Function_Libraries/PDF/__init__.py ADDED
File without changes
App_Function_Libraries/Summarization/Summarization_General_Lib.py CHANGED
@@ -25,10 +25,10 @@ from typing import Optional
25
  import requests
26
  from requests import RequestException
27
 
28
- from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text
29
  from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summarize, recursive_summarize_chunks, \
30
  improved_chunking_process
31
- from App_Function_Libraries.Diarization_Lib import combine_transcription_and_diarization
32
  from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
33
  summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
34
  from App_Function_Libraries.DB.DB_Manager import add_media_to_database
 
25
  import requests
26
  from requests import RequestException
27
 
28
+ from App_Function_Libraries.Audio.Audio_Transcription_Lib import convert_to_wav, speech_to_text
29
  from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summarize, recursive_summarize_chunks, \
30
  improved_chunking_process
31
+ from App_Function_Libraries.Audio.Diarization_Lib import combine_transcription_and_diarization
32
  from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
33
  summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
34
  from App_Function_Libraries.DB.DB_Manager import add_media_to_database
App_Function_Libraries/Video_DL_Ingestion_Lib.py CHANGED
@@ -1,331 +1,332 @@
1
- # Video_DL_Ingestion_Lib.py
2
- #########################################
3
- # Video Downloader and Ingestion Library
4
- # This library is used to handle downloading videos from YouTube and other platforms.
5
- # It also handles the ingestion of the videos into the database.
6
- # It uses yt-dlp to extract video information and download the videos.
7
- ####
8
- import json
9
- ####################
10
- # Function List
11
- #
12
- # 1. get_video_info(url)
13
- # 2. create_download_directory(title)
14
- # 3. sanitize_filename(title)
15
- # 4. normalize_title(title)
16
- # 5. get_youtube(video_url)
17
- # 6. get_playlist_videos(playlist_url)
18
- # 7. download_video(video_url, download_path, info_dict, download_video_flag)
19
- # 8. save_to_file(video_urls, filename)
20
- # 9. save_summary_to_file(summary, file_path)
21
- # 10. process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, download_video, download_audio, rolling_summarization, detail_level, question_box, keywords, chunk_summarization, chunk_duration_input, words_per_second_input)
22
- #
23
- #
24
- ####################
25
- # Import necessary libraries to run solo for testing
26
- import logging
27
- import os
28
- import re
29
- import sys
30
- from urllib.parse import urlparse, parse_qs
31
-
32
- import unicodedata
33
- # 3rd-Party Imports
34
- import yt_dlp
35
-
36
- from App_Function_Libraries.DB.DB_Manager import check_media_and_whisper_model
37
-
38
-
39
- # Import Local
40
- #
41
- #######################################################################################################################
42
- # Function Definitions
43
- #
44
-
45
- def normalize_title(title):
46
- # Normalize the string to 'NFKD' form and encode to 'ascii' ignoring non-ascii characters
47
- title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
48
- title = title.replace('/', '_').replace('\\', '_').replace(':', '_').replace('"', '').replace('*', '').replace('?',
49
- '').replace(
50
- '<', '').replace('>', '').replace('|', '')
51
- return title
52
-
53
- def get_video_info(url: str) -> dict:
54
- ydl_opts = {
55
- 'quiet': True,
56
- 'no_warnings': True,
57
- 'skip_download': True,
58
- }
59
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
60
- try:
61
- info_dict = ydl.extract_info(url, download=False)
62
- return info_dict
63
- except Exception as e:
64
- logging.error(f"Error extracting video info: {e}")
65
- return None
66
-
67
-
68
- def get_youtube(video_url):
69
- ydl_opts = {
70
- 'format': 'bestaudio[ext=m4a]',
71
- 'noplaylist': False,
72
- 'quiet': True,
73
- 'extract_flat': True
74
- }
75
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
76
- logging.debug("About to extract youtube info")
77
- info_dict = ydl.extract_info(video_url, download=False)
78
- logging.debug("Youtube info successfully extracted")
79
- return info_dict
80
-
81
-
82
- def get_playlist_videos(playlist_url):
83
- ydl_opts = {
84
- 'extract_flat': True,
85
- 'skip_download': True,
86
- 'quiet': True
87
- }
88
-
89
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
90
- info = ydl.extract_info(playlist_url, download=False)
91
-
92
- if 'entries' in info:
93
- video_urls = [entry['url'] for entry in info['entries']]
94
- playlist_title = info['title']
95
- return video_urls, playlist_title
96
- else:
97
- print("No videos found in the playlist.")
98
- return [], None
99
-
100
-
101
- def download_video(video_url, download_path, info_dict, download_video_flag, current_whisper_model):
102
- global video_file_path, ffmpeg_path
103
- global audio_file_path
104
-
105
- # Normalize Video Title name
106
- logging.debug("About to normalize downloaded video title")
107
- if 'title' not in info_dict or 'ext' not in info_dict:
108
- logging.error("info_dict is missing 'title' or 'ext'")
109
- return None
110
-
111
- normalized_video_title = normalize_title(info_dict['title'])
112
-
113
- # Check if media already exists in the database and compare whisper models
114
- should_download, reason = check_media_and_whisper_model(
115
- title=normalized_video_title,
116
- url=video_url,
117
- current_whisper_model=current_whisper_model
118
- )
119
-
120
- if not should_download:
121
- logging.info(f"Skipping download: {reason}")
122
- return None
123
-
124
- logging.info(f"Proceeding with download: {reason}")
125
-
126
- video_file_path = os.path.join(download_path, f"{normalized_video_title}.{info_dict['ext']}")
127
-
128
- # Check for existence of video file
129
- if os.path.exists(video_file_path):
130
- logging.info(f"Video file already exists: {video_file_path}")
131
- return video_file_path
132
-
133
- # Setup path handling for ffmpeg on different OSs
134
- if sys.platform.startswith('win'):
135
- ffmpeg_path = os.path.join(os.getcwd(), 'Bin', 'ffmpeg.exe')
136
- elif sys.platform.startswith('linux'):
137
- ffmpeg_path = 'ffmpeg'
138
- elif sys.platform.startswith('darwin'):
139
- ffmpeg_path = 'ffmpeg'
140
-
141
- if download_video_flag:
142
- video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")
143
- ydl_opts_video = {
144
- 'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]',
145
- 'outtmpl': video_file_path,
146
- 'ffmpeg_location': ffmpeg_path
147
- }
148
-
149
- try:
150
- with yt_dlp.YoutubeDL(ydl_opts_video) as ydl:
151
- logging.debug("yt_dlp: About to download video with youtube-dl")
152
- ydl.download([video_url])
153
- logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
154
- if os.path.exists(video_file_path):
155
- return video_file_path
156
- else:
157
- logging.error("yt_dlp: Video file not found after download")
158
- return None
159
- except Exception as e:
160
- logging.error(f"yt_dlp: Error downloading video: {e}")
161
- return None
162
- elif not download_video_flag:
163
- video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")
164
- # Set options for video and audio
165
- ydl_opts = {
166
- 'format': 'bestaudio[ext=m4a]',
167
- 'quiet': True,
168
- 'outtmpl': video_file_path
169
- }
170
-
171
- try:
172
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
173
- logging.debug("yt_dlp: About to download video with youtube-dl")
174
- ydl.download([video_url])
175
- logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
176
- if os.path.exists(video_file_path):
177
- return video_file_path
178
- else:
179
- logging.error("yt_dlp: Video file not found after download")
180
- return None
181
- except Exception as e:
182
- logging.error(f"yt_dlp: Error downloading video: {e}")
183
- return None
184
-
185
- else:
186
- logging.debug("download_video: Download video flag is set to False and video file path is not found")
187
- return None
188
-
189
-
190
- def extract_video_info(url):
191
- try:
192
- with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
193
- info = ydl.extract_info(url, download=False)
194
-
195
- # Log only a subset of the info to avoid overwhelming the logs
196
- log_info = {
197
- 'title': info.get('title'),
198
- 'duration': info.get('duration'),
199
- 'upload_date': info.get('upload_date')
200
- }
201
- logging.debug(f"Extracted info for {url}: {log_info}")
202
-
203
- return info
204
- except Exception as e:
205
- logging.error(f"Error extracting video info for {url}: {str(e)}", exc_info=True)
206
- return None
207
-
208
-
209
- def get_youtube_playlist_urls(playlist_id):
210
- ydl_opts = {
211
- 'extract_flat': True,
212
- 'quiet': True,
213
- }
214
-
215
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
216
- result = ydl.extract_info(f'https://www.youtube.com/playlist?list={playlist_id}', download=False)
217
- return [entry['url'] for entry in result['entries'] if entry.get('url')]
218
-
219
-
220
- def parse_and_expand_urls(urls):
221
- logging.info(f"Starting parse_and_expand_urls with input: {urls}")
222
- expanded_urls = []
223
-
224
- for url in urls:
225
- try:
226
- logging.info(f"Processing URL: {url}")
227
- parsed_url = urlparse(url)
228
- logging.debug(f"Parsed URL components: {parsed_url}")
229
-
230
- # YouTube playlist handling
231
- if 'youtube.com' in parsed_url.netloc and 'list' in parsed_url.query:
232
- playlist_id = parse_qs(parsed_url.query)['list'][0]
233
- logging.info(f"Detected YouTube playlist with ID: {playlist_id}")
234
- playlist_urls = get_youtube_playlist_urls(playlist_id)
235
- logging.info(f"Expanded playlist URLs: {playlist_urls}")
236
- expanded_urls.extend(playlist_urls)
237
-
238
- # YouTube short URL handling
239
- elif 'youtu.be' in parsed_url.netloc:
240
- video_id = parsed_url.path.lstrip('/')
241
- full_url = f'https://www.youtube.com/watch?v={video_id}'
242
- logging.info(f"Expanded YouTube short URL to: {full_url}")
243
- expanded_urls.append(full_url)
244
-
245
- # Vimeo handling
246
- elif 'vimeo.com' in parsed_url.netloc:
247
- video_id = parsed_url.path.lstrip('/')
248
- full_url = f'https://vimeo.com/{video_id}'
249
- logging.info(f"Processed Vimeo URL: {full_url}")
250
- expanded_urls.append(full_url)
251
-
252
- # Add more platform-specific handling here
253
-
254
- else:
255
- logging.info(f"URL not recognized as special case, adding as-is: {url}")
256
- expanded_urls.append(url)
257
-
258
- except Exception as e:
259
- logging.error(f"Error processing URL {url}: {str(e)}", exc_info=True)
260
- # Optionally, you might want to add the problematic URL to expanded_urls
261
- # expanded_urls.append(url)
262
-
263
- logging.info(f"Final expanded URLs: {expanded_urls}")
264
- return expanded_urls
265
-
266
-
267
- def extract_metadata(url, use_cookies=False, cookies=None):
268
- ydl_opts = {
269
- 'quiet': True,
270
- 'no_warnings': True,
271
- 'extract_flat': True,
272
- 'skip_download': True,
273
- }
274
-
275
- if use_cookies and cookies:
276
- try:
277
- cookie_dict = json.loads(cookies)
278
- ydl_opts['cookiefile'] = cookie_dict
279
- except json.JSONDecodeError:
280
- logging.warning("Invalid cookie format. Proceeding without cookies.")
281
-
282
- with yt_dlp.YoutubeDL(ydl_opts) as ydl:
283
- try:
284
- info = ydl.extract_info(url, download=False)
285
- metadata = {
286
- 'title': info.get('title'),
287
- 'uploader': info.get('uploader'),
288
- 'upload_date': info.get('upload_date'),
289
- 'view_count': info.get('view_count'),
290
- 'like_count': info.get('like_count'),
291
- 'duration': info.get('duration'),
292
- 'tags': info.get('tags'),
293
- 'description': info.get('description')
294
- }
295
-
296
- # Create a safe subset of metadata to log
297
- safe_metadata = {
298
- 'title': metadata.get('title', 'No title'),
299
- 'duration': metadata.get('duration', 'Unknown duration'),
300
- 'upload_date': metadata.get('upload_date', 'Unknown upload date'),
301
- 'uploader': metadata.get('uploader', 'Unknown uploader')
302
- }
303
-
304
- logging.info(f"Successfully extracted metadata for {url}: {safe_metadata}")
305
- return metadata
306
- except Exception as e:
307
- logging.error(f"Error extracting metadata for {url}: {str(e)}", exc_info=True)
308
- return None
309
-
310
-
311
- def generate_timestamped_url(url, hours, minutes, seconds):
312
- # Extract video ID from the URL
313
- video_id_match = re.search(r'(?:v=|)([0-9A-Za-z_-]{11}).*', url)
314
- if not video_id_match:
315
- return "Invalid YouTube URL"
316
-
317
- video_id = video_id_match.group(1)
318
-
319
- # Calculate total seconds
320
- total_seconds = int(hours) * 3600 + int(minutes) * 60 + int(seconds)
321
-
322
- # Generate the new URL
323
- new_url = f"https://www.youtube.com/watch?v={video_id}&t={total_seconds}s"
324
-
325
- return new_url
326
-
327
-
328
-
329
- #
330
- #
331
- #######################################################################################################################
 
 
1
+ # Video_DL_Ingestion_Lib.py
2
+ #########################################
3
+ # Video Downloader and Ingestion Library
4
+ # This library is used to handle downloading videos from YouTube and other platforms.
5
+ # It also handles the ingestion of the videos into the database.
6
+ # It uses yt-dlp to extract video information and download the videos.
7
+ ####
8
+ import json
9
+ ####################
10
+ # Function List
11
+ #
12
+ # 1. get_video_info(url)
13
+ # 2. create_download_directory(title)
14
+ # 3. sanitize_filename(title)
15
+ # 4. normalize_title(title)
16
+ # 5. get_youtube(video_url)
17
+ # 6. get_playlist_videos(playlist_url)
18
+ # 7. download_video(video_url, download_path, info_dict, download_video_flag)
19
+ # 8. save_to_file(video_urls, filename)
20
+ # 9. save_summary_to_file(summary, file_path)
21
+ # 10. process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, download_video, download_audio, rolling_summarization, detail_level, question_box, keywords, chunk_summarization, chunk_duration_input, words_per_second_input)
22
+ #
23
+ #
24
+ ####################
25
+ # Import necessary libraries to run solo for testing
26
+ import logging
27
+ import os
28
+ import re
29
+ import sys
30
+ from urllib.parse import urlparse, parse_qs
31
+
32
+ import unicodedata
33
+ # 3rd-Party Imports
34
+ import yt_dlp
35
+
36
+ from App_Function_Libraries.DB.DB_Manager import check_media_and_whisper_model
37
+
38
+
39
+ # Import Local
40
+ #
41
+ #######################################################################################################################
42
+ # Function Definitions
43
+ #
44
+
45
def normalize_title(title):
    """Return ``title`` as ASCII text with path-hostile characters neutralised.

    The text is NFKD-normalised and every code point that cannot be encoded as
    ASCII is dropped. Slash, backslash and colon become underscores; double
    quote, asterisk, question mark, angle brackets and pipe are removed.
    """
    ascii_title = (
        unicodedata.normalize('NFKD', title)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    # A single translation pass stands in for the chain of str.replace calls.
    substitutions = str.maketrans('/\\:', '___', '"*?<>|')
    return ascii_title.translate(substitutions)
52
+
53
def get_video_info(url: str) -> dict:
    """Fetch yt-dlp's info dictionary for ``url`` without downloading media.

    Returns whatever ``extract_info`` produces. If extraction raises, the
    error is logged and ``None`` is returned instead.
    """
    options = dict(quiet=True, no_warnings=True, skip_download=True)
    with yt_dlp.YoutubeDL(options) as downloader:
        try:
            return downloader.extract_info(url, download=False)
        except Exception as e:
            logging.error(f"Error extracting video info: {e}")
            return None
66
+
67
+
68
def get_youtube(video_url):
    """Return yt-dlp's flat info dictionary for ``video_url``.

    Nothing is downloaded. Playlists are not suppressed ('noplaylist' is
    False) and entries are extracted flat. Errors raised by yt-dlp propagate
    to the caller.
    """
    options = dict(
        format='bestaudio[ext=m4a]',
        noplaylist=False,
        quiet=True,
        extract_flat=True,
    )
    with yt_dlp.YoutubeDL(options) as downloader:
        logging.debug("About to extract youtube info")
        extracted = downloader.extract_info(video_url, download=False)
        logging.debug("Youtube info successfully extracted")
        return extracted
80
+
81
+
82
def get_playlist_videos(playlist_url):
    """List the video URLs of a playlist together with the playlist title.

    Returns:
        ``(video_urls, playlist_title)`` when the extracted info has an
        'entries' key, otherwise ``([], None)`` after printing a notice.
    """
    options = dict(extract_flat=True, skip_download=True, quiet=True)

    with yt_dlp.YoutubeDL(options) as downloader:
        info = downloader.extract_info(playlist_url, download=False)

        # Guard clause: nothing to list when yt-dlp reports no entries.
        if 'entries' not in info:
            print("No videos found in the playlist.")
            return [], None

        video_urls = []
        for entry in info['entries']:
            video_urls.append(entry['url'])
        return video_urls, info['title']
99
+
100
+
101
def download_video(video_url, download_path, info_dict, download_video_flag, current_whisper_model):
    """Download a video, or only its audio track, with yt-dlp.

    Args:
        video_url: URL handed to yt-dlp.
        download_path: Directory the file is written to.
        info_dict: yt-dlp info dictionary; must contain 'title' and 'ext'.
        download_video_flag: Truthy to fetch mp4 video merged with m4a audio,
            falsy to fetch the m4a audio stream only.
        current_whisper_model: Forwarded to check_media_and_whisper_model.

    Returns:
        Path of the already-present or freshly downloaded file, or None when
        info_dict is incomplete, the database check says to skip, the download
        raises, or the expected file is missing afterwards.
    """
    # The module-level names are still assigned so that any code reading them
    # after a call keeps working.
    global video_file_path, ffmpeg_path
    global audio_file_path

    # Normalize Video Title name
    logging.debug("About to normalize downloaded video title")
    if 'title' not in info_dict or 'ext' not in info_dict:
        logging.error("info_dict is missing 'title' or 'ext'")
        return None

    normalized_video_title = normalize_title(info_dict['title'])

    # FIXME - make sure this works/checks against the current model
    # Check if media already exists in the database and compare whisper models
    should_download, reason = check_media_and_whisper_model(
        title=normalized_video_title,
        url=video_url,
        current_whisper_model=current_whisper_model
    )

    if not should_download:
        logging.info(f"Skipping download: {reason}")
        return None

    logging.info(f"Proceeding with download: {reason}")

    video_file_path = os.path.join(download_path, f"{normalized_video_title}.{info_dict['ext']}")

    # Check for existence of video file
    if os.path.exists(video_file_path):
        logging.info(f"Video file already exists: {video_file_path}")
        return video_file_path

    # Windows uses the bundled binary; every other platform (including ones
    # that previously left ffmpeg_path unassigned) relies on 'ffmpeg' on PATH.
    if sys.platform.startswith('win'):
        ffmpeg_path = os.path.join(os.getcwd(), 'Bin', 'ffmpeg.exe')
    else:
        ffmpeg_path = 'ffmpeg'

    # NOTE(review): the audio-only stream is written to a '.mp4' path as well,
    # exactly as before - confirm that downstream code expects this name.
    video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")

    if download_video_flag:
        ydl_opts = {
            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]',
            'outtmpl': video_file_path,
            'ffmpeg_location': ffmpeg_path
        }
    else:
        ydl_opts = {
            'format': 'bestaudio[ext=m4a]',
            'quiet': True,
            'outtmpl': video_file_path
        }

    # Both modes share one download/verify path; only the options differ.
    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            logging.debug("yt_dlp: About to download video with youtube-dl")
            ydl.download([video_url])
            logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
            if os.path.exists(video_file_path):
                return video_file_path
            logging.error("yt_dlp: Video file not found after download")
            return None
    except Exception as e:
        logging.error(f"yt_dlp: Error downloading video: {e}")
        return None
189
+
190
+
191
def extract_video_info(url):
    """Return yt-dlp's info dictionary for ``url``, or ``None`` on any error.

    Only the title, duration and upload date are written to the debug log so
    the full dictionary does not flood it.
    """
    try:
        with yt_dlp.YoutubeDL({'quiet': True}) as downloader:
            info = downloader.extract_info(url, download=False)

            # Log only a subset of the info to avoid overwhelming the logs
            log_info = {
                field: info.get(field)
                for field in ('title', 'duration', 'upload_date')
            }
            logging.debug(f"Extracted info for {url}: {log_info}")

            return info
    except Exception as e:
        logging.error(f"Error extracting video info for {url}: {str(e)}", exc_info=True)
        return None
208
+
209
+
210
def get_youtube_playlist_urls(playlist_id):
    """Return the entry URLs of the YouTube playlist identified by ``playlist_id``.

    Entries whose 'url' is missing or empty are left out. Errors raised by
    yt-dlp propagate to the caller.
    """
    playlist_url = f'https://www.youtube.com/playlist?list={playlist_id}'
    options = dict(extract_flat=True, quiet=True)

    with yt_dlp.YoutubeDL(options) as downloader:
        result = downloader.extract_info(playlist_url, download=False)
        entry_urls = []
        for entry in result['entries']:
            if entry.get('url'):
                entry_urls.append(entry['url'])
        return entry_urls
219
+
220
+
221
def parse_and_expand_urls(urls):
    """Expand playlists and canonicalise short links in a list of media URLs.

    Args:
        urls: Iterable of URL strings.

    Returns:
        A list in which each YouTube URL carrying a ``list`` query parameter
        is replaced by the playlist's video URLs, youtu.be links are rewritten
        to ``https://www.youtube.com/watch?v=<id>``, Vimeo links are rebuilt
        as ``https://vimeo.com/<path>`` and every other URL is kept as-is.
        A URL whose processing raises is logged and left out.
    """
    logging.info(f"Starting parse_and_expand_urls with input: {urls}")
    expanded_urls = []

    for url in urls:
        try:
            logging.info(f"Processing URL: {url}")
            parsed_url = urlparse(url)
            logging.debug(f"Parsed URL components: {parsed_url}")

            # Look up the real 'list' parameter. A substring test on the raw
            # query also fires on values such as 'v=listenABCDE', which used
            # to end in a KeyError and a silently dropped URL.
            playlist_ids = parse_qs(parsed_url.query).get('list')

            # YouTube playlist handling
            if 'youtube.com' in parsed_url.netloc and playlist_ids:
                playlist_id = playlist_ids[0]
                logging.info(f"Detected YouTube playlist with ID: {playlist_id}")
                playlist_urls = get_youtube_playlist_urls(playlist_id)
                logging.info(f"Expanded playlist URLs: {playlist_urls}")
                expanded_urls.extend(playlist_urls)

            # YouTube short URL handling
            elif 'youtu.be' in parsed_url.netloc:
                video_id = parsed_url.path.lstrip('/')
                full_url = f'https://www.youtube.com/watch?v={video_id}'
                logging.info(f"Expanded YouTube short URL to: {full_url}")
                expanded_urls.append(full_url)

            # Vimeo handling
            elif 'vimeo.com' in parsed_url.netloc:
                video_id = parsed_url.path.lstrip('/')
                full_url = f'https://vimeo.com/{video_id}'
                logging.info(f"Processed Vimeo URL: {full_url}")
                expanded_urls.append(full_url)

            # Add more platform-specific handling here

            else:
                logging.info(f"URL not recognized as special case, adding as-is: {url}")
                expanded_urls.append(url)

        except Exception as e:
            # The problematic URL is deliberately not added to the result.
            logging.error(f"Error processing URL {url}: {str(e)}", exc_info=True)

    logging.info(f"Final expanded URLs: {expanded_urls}")
    return expanded_urls
266
+
267
+
268
def extract_metadata(url, use_cookies=False, cookies=None):
    """Extract a fixed set of metadata fields for ``url`` with yt-dlp.

    Args:
        url: Media URL to inspect; nothing is downloaded.
        use_cookies: When truthy and ``cookies`` is non-empty, the cookies are
            parsed and handed to yt-dlp.
        cookies: JSON text describing the cookies.

    Returns:
        A dict with the keys title, uploader, upload_date, view_count,
        like_count, duration, tags and description (each None when yt-dlp did
        not report it), or None when extraction raises.
    """
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': True,
        'skip_download': True,
    }

    if use_cookies and cookies:
        try:
            cookie_dict = json.loads(cookies)
            # NOTE(review): yt-dlp describes 'cookiefile' as a file name or
            # text stream; passing parsed JSON here looks wrong - confirm the
            # intended cookie format before relying on this path.
            ydl_opts['cookiefile'] = cookie_dict
        except json.JSONDecodeError:
            logging.warning("Invalid cookie format. Proceeding without cookies.")

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=False)
            metadata = {
                'title': info.get('title'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'duration': info.get('duration'),
                'tags': info.get('tags'),
                'description': info.get('description')
            }

            # Create a safe subset of metadata to log. Every key exists in
            # `metadata` (possibly as None), so dict.get defaults would never
            # apply; substitute the placeholder on None explicitly.
            log_placeholders = {
                'title': 'No title',
                'duration': 'Unknown duration',
                'upload_date': 'Unknown upload date',
                'uploader': 'Unknown uploader',
            }
            safe_metadata = {
                key: placeholder if metadata[key] is None else metadata[key]
                for key, placeholder in log_placeholders.items()
            }

            logging.info(f"Successfully extracted metadata for {url}: {safe_metadata}")
            return metadata
        except Exception as e:
            logging.error(f"Error extracting metadata for {url}: {str(e)}", exc_info=True)
            return None
310
+
311
+
312
def generate_timestamped_url(url, hours, minutes, seconds):
    """Build a YouTube watch URL that starts playback at the given offset.

    The video ID is the first run of eleven ID characters found in ``url``
    (optionally preceded by ``v=``). ``hours``, ``minutes`` and ``seconds``
    may be ints or numeric strings. Returns the string "Invalid YouTube URL"
    when no ID can be found.
    """
    id_match = re.search(r'(?:v=|)([0-9A-Za-z_-]{11}).*', url)
    if id_match is None:
        return "Invalid YouTube URL"

    # Weight each component by its length in seconds and add them up.
    offset = sum(
        int(amount) * unit
        for amount, unit in zip((hours, minutes, seconds), (3600, 60, 1))
    )

    return f"https://www.youtube.com/watch?v={id_match.group(1)}&t={offset}s"
327
+
328
+
329
+
330
+ #
331
+ #
332
+ #######################################################################################################################
App_Function_Libraries/Web_Scraping/Article_Extractor_Lib.py ADDED
@@ -0,0 +1,381 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Article_Extractor_Lib.py
2
+ #########################################
3
+ # Article Extraction Library
4
+ # This library is used to handle scraping and extraction of articles from web pages.
5
+ #
6
+ ####################
7
+ # Function List
8
+ #
9
+ # 1. get_page_title(url)
10
+ # 2. get_article_text(url)
11
+ # 3. get_article_title(article_url_arg)
12
+ #
13
+ ####################
14
+ #
15
+ # Import necessary libraries
16
+ import logging
17
+ # 3rd-Party Imports
18
+ import asyncio
19
+ import os
20
+ import tempfile
21
+ from datetime import datetime
22
+ from typing import List, Dict
23
+ from urllib.parse import urljoin, urlparse
24
+ from xml.dom import minidom
25
+ from playwright.async_api import async_playwright
26
+ from bs4 import BeautifulSoup
27
+ import requests
28
+ import trafilatura
29
+ import xml.etree.ElementTree as ET
30
+
31
+
32
+ # Import Local
33
+ #
34
+ #######################################################################################################################
35
+ # Function Definitions
36
+ #
37
+
38
def get_page_title(url: str) -> str:
    """Return the text of the page's ``<title>`` element.

    Args:
        url: Address of the page to fetch.

    Returns:
        The stripped title text, or "Untitled" when the request fails, the
        page has no ``<title>``, or the title carries no usable text.
    """
    try:
        # A timeout keeps one unresponsive host from blocking the caller
        # forever; a Timeout is a RequestException and is handled below.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        # .string is None for an empty <title> or one with nested markup, so
        # guard before stripping.
        title_text = title_tag.string if title_tag else None
        cleaned = title_text.strip() if title_text else ""
        return cleaned or "Untitled"
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"
48
+
49
+
50
async def scrape_article(url):
    """Render ``url`` in headless Chromium and extract the article it contains.

    Returns a dict with the keys title, author, content, date, url and
    extraction_successful. Fields trafilatura could not determine stay at
    'N/A' (content stays ''), and extraction_successful is True only when
    body text was extracted.
    """
    async def fetch_html(url: str) -> str:
        # Load the page in a browser and return the HTML once the network is idle.
        async with async_playwright() as playwright:
            browser = await playwright.chromium.launch(headless=True)
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
            page = await context.new_page()
            await page.goto(url)
            await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
            rendered = await page.content()
            await browser.close()
            return rendered

    # FIXME - Add option for extracting comments/tables/images
    def extract_article_data(html: str, url: str) -> dict:
        # Run trafilatura for body text and metadata, filling defaults for gaps.
        body_text = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        metadata = trafilatura.extract_metadata(html)
        has_body = bool(body_text)

        result = {
            'title': 'N/A',
            'author': 'N/A',
            'content': body_text if has_body else '',
            'date': 'N/A',
            'url': url,
            'extraction_successful': has_body
        }

        if metadata:
            for field in ('title', 'author', 'date'):
                result[field] = getattr(metadata, field) or 'N/A'
        else:
            logging.warning("Metadata extraction failed.")

        if not has_body:
            logging.warning("Content extraction failed.")

        return result

    def convert_html_to_markdown(html: str) -> str:
        # Flatten markup to text, keeping paragraphs separated by blank lines.
        soup = BeautifulSoup(html, 'html.parser')
        for paragraph in soup.find_all('p'):
            # Add a newline at the end of each paragraph for markdown separation
            paragraph.append('\n')
        return soup.get_text(separator='\n\n')

    page_html = await fetch_html(url)
    article = extract_article_data(page_html, url)
    if article['extraction_successful']:
        article['content'] = convert_html_to_markdown(article['content'])
    return article
108
+
109
+
110
def collect_internal_links(base_url: str) -> set:
    """Crawl a site from ``base_url`` and return every same-host page reached.

    Args:
        base_url: Starting page; only links whose network location equals
            this URL's are followed.

    Returns:
        The set of URLs that were fetched successfully. URLs whose request
        failed are logged and left out.
    """
    base_netloc = urlparse(base_url).netloc
    visited = set()
    failed = set()  # remembered so a dead link is requested only once
    to_visit = {base_url}

    while to_visit:
        current_url = to_visit.pop()
        if current_url in visited or current_url in failed:
            continue

        try:
            # Bounded wait so one unresponsive page cannot stall the crawl.
            response = requests.get(current_url, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            logging.error(f"Error visiting {current_url}: {e}")
            failed.add(current_url)
            continue

        soup = BeautifulSoup(response.text, 'html.parser')

        # Collect internal links
        for link in soup.find_all('a', href=True):
            # Relative hrefs are relative to the page they appear on, not to
            # the crawl's starting URL.
            full_url = urljoin(current_url, link['href'])
            # Only process links within the same domain
            if urlparse(full_url).netloc != base_netloc:
                continue
            if full_url not in visited and full_url not in failed:
                to_visit.add(full_url)

        visited.add(current_url)

    return visited
138
+
139
+
140
def generate_temp_sitemap_from_links(links: set) -> str:
    """
    Generate a temporary sitemap file from collected links and return its path.

    Every link gets a ``<url>`` entry with today's date as ``lastmod``,
    ``daily`` as ``changefreq`` and ``0.5`` as ``priority``. The file is not
    deleted automatically; the caller is responsible for removing it.

    :param links: A set of URLs to include in the sitemap
    :return: Path to the temporary sitemap file
    """
    # Create the root element
    urlset = ET.Element("urlset")
    urlset.set("xmlns", "http://www.sitemaps.org/schemas/sitemap/0.9")

    # One timestamp for the whole document, so entries cannot disagree when
    # the run crosses midnight.
    today = datetime.now().strftime("%Y-%m-%d")

    # Add each link to the sitemap
    for link in links:
        url = ET.SubElement(urlset, "url")
        ET.SubElement(url, "loc").text = link
        ET.SubElement(url, "lastmod").text = today
        ET.SubElement(url, "changefreq").text = "daily"
        ET.SubElement(url, "priority").text = "0.5"

    # Create the tree and get it as a string
    xml_string = ET.tostring(urlset, 'utf-8')

    # Pretty print the XML
    pretty_xml = minidom.parseString(xml_string).toprettyxml(indent=" ")

    # The XML declaration names no encoding, so readers assume UTF-8; write
    # the file as UTF-8 instead of the platform's locale encoding.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".xml", delete=False, encoding="utf-8") as temp_file:
        temp_file.write(pretty_xml)
        temp_file_path = temp_file.name

    logging.info(f"Temporary sitemap created at: {temp_file_path}")
    return temp_file_path
176
+
177
+
178
def generate_sitemap_for_url(url: str) -> List[Dict[str, str]]:
    """
    Generate a sitemap for the given URL using the create_filtered_sitemap function.

    The sitemap is written to a temporary XML file, parsed, and the file is
    removed again before returning.

    Args:
        url (str): The base URL to generate the sitemap for

    Returns:
        List[Dict[str, str]]: A list of dictionaries, each containing 'url' and 'title' keys
    """
    namespace = "{http://www.sitemaps.org/schemas/sitemap/0.9}"

    # Only the path is needed; close our handle before the sitemap is
    # written to that path.
    with tempfile.NamedTemporaryFile(mode="w+", suffix=".xml", delete=False) as temp_file:
        temp_path = temp_file.name

    try:
        create_filtered_sitemap(url, temp_path, is_content_page)
        root = ET.parse(temp_path).getroot()
    finally:
        # delete=False means nobody else cleans this file up.
        try:
            os.unlink(temp_path)
        except OSError as e:
            logging.warning(f"Could not remove temporary sitemap {temp_path}: {e}")

    sitemap = []
    for url_elem in root.findall(f".//{namespace}url"):
        loc = url_elem.find(f"{namespace}loc").text
        sitemap.append({"url": loc, "title": loc.split("/")[-1] or url})  # Use the last part of the URL as a title

    return sitemap
200
+
201
async def scrape_entire_site(base_url: str) -> List[Dict]:
    """
    Scrape the entire site by generating a temporary sitemap and extracting content from each page.

    Pages are scraped concurrently. A page whose scrape raises is logged and
    skipped, so one bad page does not discard the results of the others.

    :param base_url: The base URL of the site to scrape
    :return: A list of dictionaries containing scraped article data
    """
    # Step 1: Collect internal links from the site
    links = collect_internal_links(base_url)
    logging.info(f"Collected {len(links)} internal links.")

    # Step 2: Generate the temporary sitemap
    temp_sitemap_path = generate_temp_sitemap_from_links(links)

    # Step 3: Scrape each URL in the sitemap
    scraped_articles = []
    try:
        async def scrape_and_log(link):
            logging.info(f"Scraping {link} ...")
            try:
                article_data = await scrape_article(link)
            except Exception as e:
                # Turn a failed scrape into None; the filter below drops it.
                # Without this, the first failure aborts gather() and loses
                # every page that did succeed.
                logging.error(f"Error scraping {link}: {e}", exc_info=True)
                return None

            if article_data:
                logging.info(f"Title: {article_data['title']}")
                logging.info(f"Author: {article_data['author']}")
                logging.info(f"Date: {article_data['date']}")
                logging.info(f"Content: {article_data['content'][:500]}...")

                return article_data
            return None

        # Use asyncio.gather to scrape multiple articles concurrently
        results = await asyncio.gather(*(scrape_and_log(link) for link in links))
        # Remove any None values (failed scrapes)
        scraped_articles = [article for article in results if article is not None]

    finally:
        # Clean up the temporary sitemap file
        os.unlink(temp_sitemap_path)
        logging.info("Temporary sitemap file deleted")

    return scraped_articles
242
+
243
+
244
def scrape_by_url_level(base_url: str, level: int) -> list:
    """
    Scrape articles from URLs up to a certain level under the base URL.

    A URL's level is the number of '/'-separated segments in its path; an
    empty path counts as level 1 because splitting an empty string yields
    one (empty) segment.

    scrape_article is a coroutine function, so the scrapes are driven with
    asyncio.run. This function must therefore be called from synchronous
    code: asyncio.run raises RuntimeError inside a running event loop.

    :param base_url: The base URL whose internal links are collected
    :param level: Maximum URL level (inclusive) to scrape
    :return: List of scraped article data, one entry per successful scrape
    """

    def get_url_level(url: str) -> int:
        return len(urlparse(url).path.strip('/').split('/'))

    links = collect_internal_links(base_url)
    filtered_links = [link for link in links if get_url_level(link) <= level]

    async def scrape_all() -> list:
        # Scrape sequentially, keeping only truthy results.
        articles = []
        for link in filtered_links:
            article = await scrape_article(link)
            if article:
                articles.append(article)
        return articles

    return asyncio.run(scrape_all())
254
+
255
+
256
def scrape_from_sitemap(sitemap_url: str) -> list:
    """
    Scrape articles from a sitemap URL.

    The sitemap is downloaded and parsed, then every non-empty <loc> entry is
    scraped. scrape_article is a coroutine function, so the scrapes are driven
    with asyncio.run; call this from synchronous code only.

    :param sitemap_url: URL of the XML sitemap
    :return: List of scraped article data; empty if the sitemap cannot be
             fetched or parsed
    """
    try:
        response = requests.get(sitemap_url)
        response.raise_for_status()
        root = ET.fromstring(response.content)
    except requests.RequestException as e:
        logging.error(f"Error fetching sitemap: {e}")
        return []
    except ET.ParseError as e:
        logging.error(f"Error parsing sitemap: {e}")
        return []

    # Skip <loc> elements without text and trim surrounding whitespace.
    page_urls = [
        loc.text.strip()
        for loc in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
        if loc.text and loc.text.strip()
    ]

    async def scrape_all() -> list:
        articles = []
        for page_url in page_urls:
            article = await scrape_article(page_url)
            if article:
                articles.append(article)
        return articles

    return asyncio.run(scrape_all())
268
+
269
+
270
def convert_to_markdown(articles: list) -> str:
    """
    Render a list of article dictionaries as one markdown document.

    Each article contributes a heading with its title, author and date lines,
    its content, and a trailing horizontal-rule separator.

    :param articles: Article dicts with 'title', 'author', 'date' and 'content' keys
    :return: The concatenated markdown text ('' for an empty list)
    """
    sections = [
        f"# {entry['title']}\n\n"
        f"Author: {entry['author']}\n"
        f"Date: {entry['date']}\n\n"
        f"{entry['content']}\n\n"
        "---\n\n"  # Separator between articles
        for entry in articles
    ]
    return "".join(sections)
280
+
281
+
282
def is_content_page(url: str) -> bool:
    """
    Heuristically decide whether a URL points at a content page.

    The check is a case-insensitive substring match against a fixed list of
    fragments typical of non-content pages; it may need tuning for a
    specific website structure.

    :param url: The URL to check
    :return: False if the URL contains any excluded fragment, True otherwise
    """
    # Add more specific checks here based on the website's structure
    # Fragments that mark common non-content pages
    excluded_fragments = (
        '/tag/', '/category/', '/author/', '/search/', '/page/',
        'wp-content', 'wp-includes', 'wp-json', 'wp-admin',
        'login', 'register', 'cart', 'checkout', 'account',
        '.jpg', '.png', '.gif', '.pdf', '.zip',
    )
    lowered = url.lower()
    for fragment in excluded_fragments:
        if fragment in lowered:
            return False
    return True
299
+
300
+
301
def create_filtered_sitemap(base_url: str, output_file: str, filter_function):
    """
    Build a sitemap XML file from a site's internal links, keeping only accepted URLs.

    :param base_url: The base URL of the website
    :param output_file: The file to save the sitemap to
    :param filter_function: A function that takes a URL and returns True if it should be included
    """
    # De-duplicate the links that pass the filter.
    accepted_links = {*filter(filter_function, collect_internal_links(base_url))}

    urlset = ET.Element("urlset", {"xmlns": "http://www.sitemaps.org/schemas/sitemap/0.9"})
    for page_link in accepted_links:
        # One <url><loc>...</loc></url> entry per accepted link.
        entry = ET.SubElement(urlset, "url")
        ET.SubElement(entry, "loc").text = page_link

    ET.ElementTree(urlset).write(output_file, encoding='utf-8', xml_declaration=True)
    print(f"Filtered sitemap saved to {output_file}")
323
+
324
+
325
def scrape_from_filtered_sitemap(sitemap_file: str, filter_function) -> list:
    """
    Scrape articles from a sitemap file, applying an additional filter function.

    scrape_article is a coroutine function, so the scrapes are driven with
    asyncio.run; call this from synchronous code only.

    :param sitemap_file: Path to the sitemap file
    :param filter_function: A function that takes a URL and returns True if it should be scraped
    :return: List of scraped articles; empty if the sitemap cannot be parsed
    """
    try:
        root = ET.parse(sitemap_file).getroot()
    except ET.ParseError as e:
        logging.error(f"Error parsing sitemap: {e}")
        return []

    # Keep only <loc> entries that have text and pass the filter.
    page_urls = [
        loc.text
        for loc in root.findall('.//{http://www.sitemaps.org/schemas/sitemap/0.9}loc')
        if loc.text and filter_function(loc.text)
    ]

    async def scrape_all() -> list:
        articles = []
        for page_url in page_urls:
            article_data = await scrape_article(page_url)
            if article_data:
                articles.append(article_data)
        return articles

    return asyncio.run(scrape_all())
348
+
349
+
350
def scrape_and_convert_with_filter(source: str, output_file: str, filter_function=is_content_page, level: int = None):
    """
    Scrape articles from a sitemap or by URL level, apply filtering, and convert to a single markdown file.

    :param source: URL of the sitemap, base URL for level-based scraping, or path to a local sitemap file
    :param output_file: Path to save the output markdown file
    :param filter_function: Function to filter URLs (default is is_content_page)
    :param level: URL level for scraping (None if using sitemap)
    """
    if level is not None:
        # Scraping by URL level
        articles = scrape_by_url_level(source, level)
    elif source.startswith('http'):
        # Scraping from online sitemap
        articles = scrape_from_sitemap(source)
    else:
        # Scraping from local sitemap file
        articles = scrape_from_filtered_sitemap(source, filter_function)

    # Filter once for every source kind (previously repeated per branch and again here).
    # NOTE(review): assumes each scraped article dict carries a 'url' key - confirm against scrape_article.
    articles = [article for article in articles if filter_function(article['url'])]
    markdown_content = convert_to_markdown(articles)

    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(markdown_content)

    logging.info(f"Scraped and filtered content saved to {output_file}")
378
+
379
+ #
380
+ #
381
+ #######################################################################################################################
App_Function_Libraries/Web_Scraping/Article_Summarization_Lib.py ADDED
@@ -0,0 +1,249 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Article_Summarization_Lib.py
2
+ #########################################
3
+ # Article Summarization Library
4
+ # This library is used to handle summarization of articles.
5
+ import asyncio
6
+ # FIXME - this library should be refactored into `Article_Extractor_Lib` and then renamed to `Web_Scraping_Lib`
7
+
8
+ #
9
+ ####
10
+ #
11
+ ####################
12
+ # Function List
13
+ #
14
+ # 1.
15
+ #
16
+ ####################
17
+ #
18
+ # Import necessary libraries
19
+ import datetime
20
+ from datetime import datetime
21
+ import gradio as gr
22
+ import json
23
+ import os
24
+ import logging
25
+ import requests
26
+ # 3rd-Party Imports
27
+ #
28
+ # Local Imports
29
+ from App_Function_Libraries.Utils.Utils import sanitize_filename
30
+ from App_Function_Libraries.Web_Scraping.Article_Extractor_Lib import scrape_article
31
+ from App_Function_Libraries.Summarization.Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
32
+ summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
33
+ from App_Function_Libraries.Summarization.Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, \
34
+ summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface, \
35
+ summarize_with_mistral
36
+ from App_Function_Libraries.DB.DB_Manager import ingest_article_to_db
37
+ #
38
+ #######################################################################################################################
39
+ # Function Definitions
40
+ #
41
+
42
async def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles, system_message=None):
    """
    Scrape several URLs and return the successfully extracted articles.

    NOTE: despite the name, this function currently only scrapes;
    custom_prompt_arg, api_name, api_key, keywords and system_message are
    accepted but not used here.

    :param urls: Newline-separated URLs; blank lines are ignored
    :param custom_article_titles: Optional newline-separated titles matched to
        the URLs by position; a blank line keeps the scraped title
    :return: List of article dicts whose 'extraction_successful' flag is truthy
    """
    urls = [url.strip() for url in urls.split('\n') if url.strip()]
    # Strip each title (as scrape_and_summarize does) so whitespace-only or
    # '\r'-terminated lines do not override the scraped title. Blank lines are
    # kept to preserve positional alignment with the URLs.
    custom_titles = (
        [title.strip() for title in custom_article_titles.split('\n')]
        if custom_article_titles else []
    )

    results = []
    errors = []

    # Create a progress bar
    progress = gr.Progress()

    # FIXME - add progress tracking to the gradio UI
    for i, url in enumerate(urls):
        custom_title = custom_titles[i] if i < len(custom_titles) else None
        try:
            article = await scrape_article(url)
            if article and article['extraction_successful']:
                if custom_title:
                    article['title'] = custom_title
                results.append(article)
        except Exception as e:
            error_message = f"Error processing URL {i + 1} ({url}): {str(e)}"
            errors.append(error_message)

        # Update progress
        progress((i + 1) / len(urls), desc=f"Processed {i + 1}/{len(urls)} URLs")

    if errors:
        logging.error("\n".join(errors))

    return results
72
+
73
+
74
+
75
def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title, system_message=None):
    """
    Scrape one article, optionally summarize it with the selected API, and ingest it into the database.

    :param url: URL of the article to scrape
    :param custom_prompt_arg: Prompt for the summarizer (a default is used when empty)
    :param api_name: Name of the summarization backend (case-insensitive); falsy to skip summarization
    :param api_key: API key forwarded to backends that take one
    :param keywords: Keywords forwarded to the database ingestion call
    :param custom_article_title: Title overriding the scraped one when non-empty
    :param system_message: System prompt for the summarizer (a default is used when empty)
    :return: A human-readable report string, or a failure message; exceptions
             are logged and reported in the returned string rather than raised
    """
    try:
        # Step 1: Scrape the article
        article_data = asyncio.run(scrape_article(url))
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided, otherwise use the scraped title
        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Custom system prompt for the article
        system_message = system_message or "Act as a professional summarizer and summarize this article."
        # Custom prompt for the article
        article_custom_prompt = custom_prompt_arg or "Act as a professional summarizer and summarize this article."

        # Step 2: Summarize the article
        summary = None
        if api_name:
            logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")

            # Sanitize filename for saving the JSON file
            sanitized_title = sanitize_filename(title)
            # Make sure the output directory exists before writing into it.
            os.makedirs("Results", exist_ok=True)
            json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")

            with open(json_file_path, 'w') as json_file:
                json.dump([{'text': content}], json_file, indent=2)

            api = api_name.lower()
            try:
                # Argument order differs between backends; each call is kept
                # exactly as the backend is invoked elsewhere in this module.
                if api == 'openai':
                    summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "anthropic":
                    summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "cohere":
                    summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "groq":
                    logging.debug("MAIN: Trying to summarize with groq")
                    summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "openrouter":
                    logging.debug("MAIN: Trying to summarize with OpenRouter")
                    summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "deepseek":
                    logging.debug("MAIN: Trying to summarize with DeepSeek")
                    summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "mistral":
                    summary = summarize_with_mistral(api_key, json_file_path, article_custom_prompt, system_message)

                elif api == "llama.cpp":
                    logging.debug("MAIN: Trying to summarize with Llama.cpp")
                    summary = summarize_with_llama(json_file_path, article_custom_prompt, system_message)

                elif api == "kobold":
                    logging.debug("MAIN: Trying to summarize with Kobold.cpp")
                    summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt, system_message)

                elif api == "ooba":
                    summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt, system_message)

                elif api == "tabbyapi":
                    summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt, system_message)

                elif api == "vllm":
                    logging.debug("MAIN: Trying to summarize with VLLM")
                    summary = summarize_with_vllm(json_file_path, article_custom_prompt, system_message)

                elif api == "local-llm":
                    logging.debug("MAIN: Trying to summarize with Local LLM")
                    summary = summarize_with_local_llm(json_file_path, article_custom_prompt, system_message)

                elif api == "huggingface":
                    logging.debug("MAIN: Trying to summarize with huggingface")
                    # Keep the result: it was previously discarded, so this
                    # backend always ended in "Summary not available".
                    summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt, system_message)
                # Add additional API handlers here...

            except requests.exceptions.ConnectionError as e:
                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")

            if summary:
                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
                save_summary_to_file(summary, json_file_path)
            else:
                summary = "Summary not available"
                logging.warning(f"Failed to generate summary using {api_name} API")

        else:
            summary = "Article Summarization: No API provided for summarization."

        print(f"Summary: {summary}")  # Debugging statement

        # Step 3: Ingest the article into the database
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
                                                article_custom_prompt)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
194
+
195
+
196
def scrape_and_no_summarize_then_ingest(url, keywords, custom_article_title):
    """
    Scrape one article and ingest it into the database without summarizing it.

    :param url: URL of the article to scrape
    :param keywords: Keywords forwarded to the database ingestion call
    :param custom_article_title: Title overriding the scraped one when non-empty
    :return: A human-readable report string, or a failure message; exceptions
             are logged and reported in the returned string rather than raised
    """
    try:
        # Step 1: Scrape the article
        article_data = asyncio.run(scrape_article(url))
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided, otherwise use the scraped title
        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Step 2: Ingest the article into the database
        # Positional order matches the other call sites in this module:
        # (..., keywords, summary, ingestion_date, custom_prompt). There is no
        # summary or prompt here, so both are None. Previously the date was
        # passed in the summary position.
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, None, ingestion_date, None)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
219
+
220
+
221
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title, system_message=None):
    """
    Optionally summarize a piece of free text and ingest it into the database.

    Only the 'openai' backend is wired up here; any other api_name yields the
    summary "Unsupported API.".

    :param text: The raw text to ingest
    :param custom_prompt: Prompt forwarded to the summarizer and stored with the entry
    :param api_name: Name of the summarization backend (case-insensitive); falsy to skip summarization
    :param api_key: API key forwarded to the summarizer
    :param keywords: Keywords forwarded to the database ingestion call
    :param custom_article_title: Title for the entry; defaults to "Unstructured Text"
    :param system_message: System prompt forwarded to the summarizer
    :return: A human-readable report string with title, summary and ingestion result
    """
    title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    # Summarize the unstructured text
    if api_name:
        # The title is user-supplied: sanitize it before using it in a path,
        # as scrape_and_summarize does, and make sure the directory exists.
        # NOTE(review): assumes sanitize_filename strips path-unsafe characters - confirm in Utils.
        safe_title = sanitize_filename(title.replace(' ', '_'))
        os.makedirs("Results", exist_ok=True)
        json_file_path = os.path.join("Results", f"{safe_title}_segments.json")
        with open(json_file_path, 'w') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        if api_name.lower() == 'openai':
            summary = summarize_with_openai(api_key, json_file_path, custom_prompt, system_message)
        # Add other APIs as needed
        else:
            summary = "Unsupported API."
    else:
        summary = "No API provided for summarization."

    # Ingest the unstructured text into the database
    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary, ingestion_date,
                                            custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
244
+
245
+
246
+
247
+ #
248
+ #
249
+ #######################################################################################################################
App_Function_Libraries/Web_Scraping/__init__.py ADDED
File without changes