oceansweep committed
Commit 66a2900
1 Parent(s): 0215147

Upload 2 files

App_Function_Libraries/DB/DB_Manager.py CHANGED
@@ -5,16 +5,14 @@
import configparser
import os
import logging
- import threading
- from contextlib import contextmanager
from typing import Tuple, List, Union, Dict
- import sqlite3
import time
#
# 3rd-Party Libraries
from elasticsearch import Elasticsearch
#
# Import your existing SQLite functions
+ from App_Function_Libraries.DB.SQLite_DB import DatabaseError
from App_Function_Libraries.DB.SQLite_DB import (
    update_media_content as sqlite_update_media_content,
    list_prompts as sqlite_list_prompts,
@@ -49,166 +47,93 @@ from App_Function_Libraries.DB.SQLite_DB import (
    search_and_display_items as sqlite_search_and_display_items,
    get_conversation_name as sqlite_get_conversation_name,
    add_media_with_keywords as sqlite_add_media_with_keywords,
-     check_media_and_whisper_model as sqlite_check_media_and_whisper_model,
-     DatabaseError, create_document_version as sqlite_create_document_version,
-     get_document_version as sqlite_get_document_version, sqlite_search_db, sqlite_add_media_chunk,
+     check_media_and_whisper_model as sqlite_check_media_and_whisper_model, \
+     create_document_version as sqlite_create_document_version,
+     get_document_version as sqlite_get_document_version, sqlite_search_db, add_media_chunk as sqlite_add_media_chunk,
    sqlite_update_fts_for_media, sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
    search_media_database as sqlite_search_media_database, mark_as_trash as sqlite_mark_as_trash, \
    get_media_transcripts as sqlite_get_media_transcripts, get_specific_transcript as sqlite_get_specific_transcript, \
    get_media_summaries as sqlite_get_media_summaries, get_specific_summary as sqlite_get_specific_summary, \
    get_media_prompts as sqlite_get_media_prompts, get_specific_prompt as sqlite_get_specific_prompt, \
-     delete_specific_transcript as sqlite_delete_specific_transcript, delete_specific_summary as sqlite_delete_specific_summary, \
-     delete_specific_prompt as sqlite_delete_specific_prompt, fetch_keywords_for_media as sqlite_fetch_keywords_for_media, \
+     delete_specific_transcript as sqlite_delete_specific_transcript,
+     delete_specific_summary as sqlite_delete_specific_summary, \
+     delete_specific_prompt as sqlite_delete_specific_prompt,
+     fetch_keywords_for_media as sqlite_fetch_keywords_for_media, \
    update_keywords_for_media as sqlite_update_keywords_for_media, check_media_exists as sqlite_check_media_exists, \
    search_prompts as sqlite_search_prompts, get_media_content as sqlite_get_media_content, \
    get_paginated_files as sqlite_get_paginated_files, get_media_title as sqlite_get_media_title, \
    get_all_content_from_database as sqlite_get_all_content_from_database,
+     get_next_media_id as sqlite_get_next_media_id, \
+     batch_insert_chunks as sqlite_batch_insert_chunks, Database,
)
#
# Local Imports
from App_Function_Libraries.Utils.Utils import load_comprehensive_config, get_database_path, get_project_relative_path
#
# End of imports
+ ############################################################################################################
+
+
############################################################################################################
#
- # Globals
+ # Database Config loading
+
+ logger = logging.getLogger(__name__)

- # Load configuration from config file
config_path = get_project_relative_path('Config_Files/config.txt')
config = configparser.ConfigParser()
config.read(config_path)

db_path: str = config.get('Database', 'sqlite_path', fallback='./Databases/media_summary.db')
-
backup_path: str = config.get('Database', 'backup_path', fallback='database_backups')
-
backup_dir: Union[str, bytes] = os.environ.get('DB_BACKUP_DIR', backup_path)

- #
- # End of Globals
- ############################################################################################################
- #
- # Database Manager Class
-
- logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
- logger = logging.getLogger(__name__)
-
-
- class Database:
-     def __init__(self, db_name='media_summary.db'):
-         self.db_path = get_database_path(db_name)
-         self.pool = []
-         self.pool_size = 10
-         self.lock = threading.Lock()
-         self.timeout = 60.0  # 60 seconds timeout
-
-     @contextmanager
-     def get_connection(self):
-         retry_count = 5
-         retry_delay = 1
-         while retry_count > 0:
-             try:
-                 if self.pool:
-                     conn = self.pool.pop()
-                 else:
-                     conn = sqlite3.connect(self.db_path, timeout=self.timeout, check_same_thread=False)
-                     conn.execute("PRAGMA journal_mode=WAL;")  # Enable WAL mode
-                 yield conn
-                 self.pool.append(conn)
-                 return
-             except sqlite3.OperationalError as e:
-                 if 'database is locked' in str(e):
-                     logger.warning(f"Database is locked, retrying in {retry_delay} seconds...")
-                     retry_count -= 1
-                     time.sleep(retry_delay)
-                     retry_delay *= 2  # Exponential backoff
-                 else:
-                     raise DatabaseError(f"Database error: {e}")
-             except Exception as e:
-                 raise DatabaseError(f"Unexpected error: {e}")
-         raise DatabaseError("Database is locked and retries have been exhausted")
-
-     def execute_query(self, query: str, params: Tuple = ()) -> None:
-         with self.lock:  # Use a global lock for write operations
-             with self.get_connection() as conn:
-                 try:
-                     cursor = conn.cursor()
-                     cursor.execute(query, params)
-                     conn.commit()
-                 except sqlite3.Error as e:
-                     logger.error(f"Database error: {e}, Query: {query}")
-                     raise DatabaseError(f"Database error: {e}, Query: {query}")
-
-     def execute_many(self, query: str, params_list: List[Tuple]) -> None:
-         with self.lock:  # Use a global lock for write operations
-             with self.get_connection() as conn:
-                 try:
-                     cursor = conn.cursor()
-                     cursor.executemany(query, params_list)
-                     conn.commit()
-                 except sqlite3.Error as e:
-                     logger.error(f"Database error: {e}, Query: {query}")
-                     raise DatabaseError(f"Database error: {e}, Query: {query}")
-
-     def close_all_connections(self):
-         for conn in self.pool:
-             conn.close()
-         self.pool.clear()

-
- #
- # class Database:
- #     def __init__(self, db_name='media_summary.db'):
- #         self.db_path = get_database_path(db_name)
- #         self.pool = []
- #         self.pool_size = 10
- #
- #     @contextmanager
- #     def get_connection(self):
- #         retry_count = 5
- #         retry_delay = 1
- #         while retry_count > 0:
- #             try:
- #                 if self.pool:
- #                     conn = self.pool.pop()
- #                 else:
- #                     conn = sqlite3.connect(self.db_path, check_same_thread=False)
- #                 yield conn
- #                 self.pool.append(conn)
- #                 return
- #             except sqlite3.OperationalError as e:
- #                 if 'database is locked' in str(e):
- #                     logger.warning(f"Database is locked, retrying in {retry_delay} seconds...")
- #                     retry_count -= 1
- #                     time.sleep(retry_delay)
- #                     retry_delay *= 2  # Exponential backoff
- #                 else:
- #                     raise DatabaseError(f"Database error: {e}")
- #             except Exception as e:
- #                 raise DatabaseError(f"Unexpected error: {e}")
- #         raise DatabaseError("Database is locked and retries have been exhausted")
- #
- #     def execute_query(self, query: str, params: Tuple = ()) -> None:
- #         with self.get_connection() as conn:
- #             try:
- #                 cursor = conn.cursor()
- #                 cursor.execute(query, params)
- #                 conn.commit()
- #             except sqlite3.Error as e:
- #                 logger.error(f"Database error: {e}, Query: {query}")
- #                 raise DatabaseError(f"Database error: {e}, Query: {query}")
- #
- #     def close_all_connections(self):
- #         for conn in self.pool:
- #             conn.close()
- #         self.pool.clear()
- #
-
- #
- # End of Database Manager Class
- ############################################################################################################
- #
- # Database Config loading
+ def get_db_config():
+     try:
+         config = load_comprehensive_config()
+
+         if 'Database' not in config:
+             print("Warning: 'Database' section not found in config. Using default values.")
+             return default_db_config()
+
+         return {
+             'type': config.get('Database', 'type', fallback='sqlite'),
+             'sqlite_path': config.get('Database', 'sqlite_path', fallback='Databases/media_summary.db'),
+             'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
+             'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
+         }
+     except FileNotFoundError:
+         print("Warning: Config file not found. Using default database configuration.")
+         return default_db_config()
+     except Exception as e:
+         print(f"Error reading config: {str(e)}. Using default database configuration.")
+         return default_db_config()
+
+ def default_db_config():
+     return {
+         'type': 'sqlite',
+         'sqlite_path': get_database_path('media_summary.db'),
+         'elasticsearch_host': 'localhost',
+         'elasticsearch_port': 9200
+     }
+
+ def ensure_directory_exists(file_path):
+     directory = os.path.dirname(file_path)
+     if not os.path.exists(directory):
+         os.makedirs(directory)
+         print(f"Created directory: {directory}")
+
+ db_config = get_db_config()
+ db_type = db_config['type']
+
+ if db_type == 'sqlite':
+     db = Database(os.path.basename(db_config['sqlite_path']))
+ elif db_type == 'elasticsearch':
+     raise NotImplementedError("Elasticsearch support not yet implemented")
+ else:
+     raise ValueError(f"Unsupported database type: {db_type}")
+
+ print(f"Database path: {db.db_path}")

def get_db_config():
    try:
@@ -335,6 +260,12 @@ def get_media_title(*args, **kwargs):
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented")

+ def get_next_media_id():
+     if db_type == 'sqlite':
+         return sqlite_get_next_media_id()
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented")

#
# End of DB-Searching functions
@@ -441,9 +372,18 @@ def ingest_article_to_db(url, title, author, content, keywords, summary, ingesti
        raise ValueError(f"Unsupported database type: {db_type}")


- def add_media_chunk(media_id: int, chunk_text: str, start_index: int, end_index: int, chunk_id: str):
+ def add_media_chunk(*args, **kwargs):
    if db_type == 'sqlite':
-         sqlite_add_media_chunk(db, media_id, chunk_text, start_index, end_index, chunk_id)
+         sqlite_add_media_chunk(*args, **kwargs)
+     elif db_type == 'elasticsearch':
+         # Implement Elasticsearch version
+         raise NotImplementedError("Elasticsearch version not yet implemented")
+     else:
+         raise ValueError(f"Unsupported database type: {db_type}")
+
+ def batch_insert_chunks(*args, **kwargs):
+     if db_type == 'sqlite':
+         sqlite_batch_insert_chunks(*args, **kwargs)
    elif db_type == 'elasticsearch':
        # Implement Elasticsearch version
        raise NotImplementedError("Elasticsearch version not yet implemented")
@@ -850,16 +790,9 @@ def get_document_version(*args, **kwargs):
# End of Document Versioning Functions
############################################################################################################

-
-
- ############################################################################################################
- #
- # Function to close the database connection for SQLite
-
def close_connection():
    if db_type == 'sqlite':
-         db.close_all_connections()
-         # Elasticsearch doesn't need explicit closing
+         db.get_connection().close()

#
# End of file
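For reference, a minimal usage sketch (Python) of the reworked DB_Manager wrappers above. It assumes the package is importable, that `db_type` resolves to 'sqlite' via Config_Files/config.txt, and the chunk values are illustrative placeholders:

from App_Function_Libraries.DB.DB_Manager import add_media_chunk, get_next_media_id

next_id = get_next_media_id()  # dispatches to sqlite_get_next_media_id() for the SQLite backend
# Positional args mirror the SQLite implementation: (media_id, chunk_text, start_index, end_index, chunk_id)
add_media_chunk(next_id, "example chunk text", 0, 18, f"{next_id}_chunk_0")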
App_Function_Libraries/DB/SQLite_DB.py CHANGED
@@ -49,24 +49,22 @@ import csv
import html
import logging
import os
+ import queue
import re
import shutil
import sqlite3
- import time
import traceback
- from contextlib import contextmanager
from datetime import datetime, timedelta
- from typing import List, Tuple, Dict, Any
+ from typing import List, Tuple, Dict, Any, Optional
# Local Libraries
- from App_Function_Libraries.Utils.Utils import is_valid_url, get_project_relative_path, get_database_path, \
-     get_database_dir, ensure_directory_exists
+ from App_Function_Libraries.Utils.Utils import get_project_relative_path, get_database_path, \
+     get_database_dir
+ from App_Function_Libraries.Chunk_Lib import chunk_options, chunk_text
+ #
# Third-Party Libraries
import gradio as gr
import pandas as pd
import yaml
-
-
- # Import Local Libraries
#
#######################################################################################################################
# Function Definitions
@@ -78,8 +76,6 @@ def ensure_database_directory():
ensure_database_directory()

# Set up logging
- #logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
- #logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# FIXME - Setup properly and test/add documentation for its existence...
@@ -117,6 +113,9 @@ print(f"Backup directory: {backup_dir}")

#
#
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Backup-related functions
@@ -180,6 +179,9 @@ def rotate_backups(backup_dir, max_backups=10):

#
#
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# DB-Integrity Check Functions
@@ -204,88 +206,50 @@ def check_database_integrity(db_path):

#
# End of DB-Integrity Check functions
+ #######################################################################################################################
+
+
#######################################################################################################################
#
- # Media-related Functions
+ # DB Setup Functions

- # Custom exceptions
class DatabaseError(Exception):
    pass

-
class InputError(Exception):
    pass

-
- # Database connection function with connection pooling
-
class Database:
-     def __init__(self, db_name=None):
-         self.db_name = db_name or os.getenv('DB_NAME', 'media_summary.db')
-         self.db_path = get_database_path(self.db_name)
-         ensure_directory_exists(os.path.dirname(self.db_path))
-         self.pool = []
-         self.pool_size = 10
-         logging.info(f"Database initialized with path: {self.db_path}")
-
-     @contextmanager
-     def get_connection(self):
-         conn = None
-         try:
-             conn = self._get_connection_from_pool()
-             yield conn
-         except Exception as e:
-             if conn:
-                 conn.rollback()
-             raise e
-         finally:
-             if conn:
-                 conn.commit()
-                 self.release_connection(conn)
-
-     def _get_connection_from_pool(self):
-         retry_count = 5
-         retry_delay = 1
-         while retry_count > 0:
-             try:
-                 if self.pool:
-                     return self.pool.pop()
-                 else:
-                     return sqlite3.connect(self.db_path, check_same_thread=False)
-             except sqlite3.OperationalError as e:
-                 if 'database is locked' in str(e):
-                     logging.warning(f"Database is locked, retrying in {retry_delay} seconds...")
-                     retry_count -= 1
-                     time.sleep(retry_delay)
-                 else:
-                     logging.error(f"Database error: {e}")
-                     raise DatabaseError(f"Database error: {e}")
-             except Exception as e:
-                 logging.error(f"Unexpected error: {e}")
-                 raise DatabaseError(f"Unexpected error: {e}")
-         raise DatabaseError("Database is locked and retries have been exhausted")
-
-     def release_connection(self, conn):
-         if len(self.pool) < self.pool_size:
-             self.pool.append(conn)
-         else:
-             conn.close()
+     def __init__(self, db_name='media_summary.db'):
+         self.db_path = get_database_path(db_name)
+         self.timeout = 60.0  # 60 seconds timeout
+
+     def get_connection(self):
+         return sqlite3.connect(self.db_path, timeout=self.timeout)

    def execute_query(self, query: str, params: Tuple = ()) -> None:
        with self.get_connection() as conn:
-             cursor = conn.cursor()
-             cursor.execute(query, params)
-
-     def close_all_connections(self):
-         for conn in self.pool:
-             conn.close()
-         self.pool.clear()
-         logging.info("All database connections closed")
+             try:
+                 cursor = conn.cursor()
+                 cursor.execute(query, params)
+                 conn.commit()
+             except sqlite3.Error as e:
+                 logging.error(f"Database error: {e}, Query: {query}")
+                 raise DatabaseError(f"Database error: {e}, Query: {query}")
+
+     def execute_many(self, query: str, params_list: List[Tuple]) -> None:
+         with self.get_connection() as conn:
+             try:
+                 cursor = conn.cursor()
+                 cursor.executemany(query, params_list)
+                 conn.commit()
+             except sqlite3.Error as e:
+                 logging.error(f"Database error: {e}, Query: {query}")
+                 raise DatabaseError(f"Database error: {e}, Query: {query}")

db = Database()

- def instantiate_SQLite_db():
+ def instantiate_sqlite_db():
    global sqlite_db
    sqlite_db = Database()

@@ -308,7 +272,8 @@ def create_tables(db) -> None:
            transcription_model TEXT,
            is_trash BOOLEAN DEFAULT 0,
            trash_date DATETIME,
-             vector_embedding BLOB
+             vector_embedding BLOB,
+             chunking_status TEXT DEFAULT 'pending'
        )
        ''',
        '''
@@ -462,13 +427,21 @@ def create_tables(db) -> None:
create_tables(db)


- def check_media_exists(title, url):
-     """Check if media with the given title or URL exists in the database."""
-     with db.get_connection() as conn:
-         cursor = conn.cursor()
-         cursor.execute("SELECT id FROM Media WHERE title = ? OR url = ?", (title, url))
-         result = cursor.fetchone()
-         return result is not None
+ def check_media_exists(title: str, url: str) -> Optional[int]:
+     try:
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             query = 'SELECT id FROM Media WHERE title = ? AND url = ?'
+             cursor.execute(query, (title, url))
+             result = cursor.fetchone()
+             logging.debug(f"check_media_exists query: {query}")
+             logging.debug(f"check_media_exists params: title={title}, url={url}")
+             logging.debug(f"check_media_exists result: {result}")
+             return result[0] if result else None
+     except Exception as e:
+         logging.error(f"Error checking if media exists: {str(e)}")
+         logging.error(f"Exception details: {traceback.format_exc()}")
+         return None


def check_media_and_whisper_model(title=None, url=None, current_whisper_model=None):
@@ -540,7 +513,7 @@ def check_media_and_whisper_model(title=None, url=None, current_whisper_model=No
    return False, f"Media found with same whisper model (ID: {media_id})"


- def sqlite_add_media_chunk(db, media_id: int, chunk_text: str, start_index: int, end_index: int, chunk_id: str):
+ def add_media_chunk(media_id: int, chunk_text: str, start_index: int, end_index: int, chunk_id: str):
    with db.get_connection() as conn:
        cursor = conn.cursor()
        cursor.execute(
@@ -562,8 +535,21 @@ def sqlite_get_unprocessed_media(db):
        cursor.execute("SELECT id, content, type FROM Media WHERE id NOT IN (SELECT DISTINCT media_id FROM MediaChunks)")
        return cursor.fetchall()

+ def get_next_media_id():
+     try:
+         conn = sqlite3.connect(db_path)
+         cursor = conn.cursor()
+         cursor.execute("SELECT MAX(media_id) FROM media")
+         max_id = cursor.fetchone()[0]
+         return (max_id or 0) + 1
+     finally:
+         conn.close()
+
#
# End of Media-related Functions
+ #######################################################################################################################
+
+
#######################################################################################################################
# Keyword-related Functions
#
@@ -612,8 +598,12 @@ def delete_keyword(keyword: str) -> str:
# Function to add media with keywords
def add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author,
                            ingestion_date):
+     logging.debug(f"Entering add_media_with_keywords: URL={url}, Title={title}")
    # Set default values for missing fields
-     url = url or 'Unknown'
+     if url is None:
+         url = 'localhost'
+     elif url is not None:
+         url = url
    title = title or 'Untitled'
    media_type = media_type or 'Unknown'
    content = content or 'No content available'
@@ -624,10 +614,6 @@ def add_media_with_keywords(url, title, media_type, content, keywords, prompt, s
    author = author or 'Unknown'
    ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')

-     # Ensure URL is valid
-     if not is_valid_url(url):
-         url = 'localhost'
-
    if media_type not in ['article', 'audio', 'document', 'mediawiki_article', 'mediawiki_dump', 'obsidian_note', 'podcast', 'text', 'video', 'unknown']:
        raise InputError("Invalid media type. Allowed types: article, audio file, document, obsidian_note podcast, text, video, unknown.")

@@ -653,68 +639,64 @@ def add_media_with_keywords(url, title, media_type, content, keywords, prompt, s

    try:
        with db.get_connection() as conn:
-             conn.execute("BEGIN TRANSACTION")
            cursor = conn.cursor()

-             # Check if media already exists
-             cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
-             existing_media = cursor.fetchone()
-
-             if existing_media:
-                 media_id = existing_media[0]
-                 logging.info(f"Updating existing media with ID: {media_id}")
+             # Check if media already exists using both title and URL
+             existing_media_id = check_media_exists(title, url)
+             logging.debug(f"Existing media ID for {url}: {existing_media_id}")

+             if existing_media_id:
+                 media_id = existing_media_id
+                 logging.debug(f"Updating existing media with ID: {media_id}")
                cursor.execute('''
                    UPDATE Media
-                     SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?
+                     SET content = ?, transcription_model = ?, type = ?, author = ?, ingestion_date = ?
                    WHERE id = ?
-                 ''', (content, transcription_model, title, media_type, author, ingestion_date, media_id))
+                 ''', (content, transcription_model, media_type, author, ingestion_date, media_id))
            else:
-                 logging.info("Creating new media entry")
-
+                 logging.debug("Inserting new media")
                cursor.execute('''
                    INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
                    VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (url, title, media_type, content, author, ingestion_date, transcription_model))
                media_id = cursor.lastrowid
+                 logging.debug(f"New media inserted with ID: {media_id}")

-             logging.info(f"Adding new modification to MediaModifications for media ID: {media_id}")
            cursor.execute('''
                INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                VALUES (?, ?, ?, ?)
            ''', (media_id, prompt, summary, ingestion_date))
-             logger.info("New modification added to MediaModifications")

-             # Insert keywords and associate with media item
-             logging.info("Processing keywords")
-             for keyword in keyword_list:
-                 keyword = keyword.strip().lower()
-                 cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
-                 cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
-                 keyword_id = cursor.fetchone()[0]
-                 cursor.execute('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)',
-                                (media_id, keyword_id))
+             # Batch insert keywords
+             keyword_params = [(keyword.strip().lower(),) for keyword in keyword_list]
+             cursor.executemany('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', keyword_params)
+
+             # Get keyword IDs
+             placeholder = ','.join(['?'] * len(keyword_list))
+             cursor.execute(f'SELECT id, keyword FROM Keywords WHERE keyword IN ({placeholder})', keyword_list)
+             keyword_ids = cursor.fetchall()
+
+             # Batch insert media-keyword associations
+             media_keyword_params = [(media_id, keyword_id) for keyword_id, _ in keyword_ids]
+             cursor.executemany('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)', media_keyword_params)

            # Update full-text search index
-             logging.info("Updating full-text search index")
            cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
                           (media_id, title, content))

-             logging.info("Adding new media version")
-             add_media_version(media_id, prompt, summary)
+             # Add media version
+             add_media_version(conn, media_id, prompt, summary)

            conn.commit()
            logging.info(f"Media '{title}' successfully added/updated with ID: {media_id}")

-             return f"Media '{title}' added/updated successfully with keywords: {', '.join(keyword_list)}"
+         return media_id, f"Media '{title}' added/updated successfully with keywords: {', '.join(keyword_list)}"

    except sqlite3.Error as e:
-         conn.rollback()
-         logging.error(f"SQL Error: {e}")
+         logging.error(f"SQL Error in add_media_with_keywords: {e}")
        raise DatabaseError(f"Error adding media with keywords: {e}")
    except Exception as e:
-         conn.rollback()
-         logging.error(f"Unexpected Error: {e}")
+         logging.error(f"Unexpected Error in add_media_with_keywords: {e}")
        raise DatabaseError(f"Unexpected error: {e}")


@@ -866,6 +848,9 @@ def update_keywords_for_media(media_id, keyword_list):

#
# End of Keyword-related functions
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Media-related Functions
@@ -922,7 +907,10 @@ def fetch_item_details(media_id: int):
    return "", "", ""

#
- #
+ # End of Media-related Functions
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Media-related Functions
@@ -930,21 +918,19 @@ def fetch_item_details(media_id: int):


# Function to add a version of a prompt and summary
- def add_media_version(media_id: int, prompt: str, summary: str) -> None:
+ def add_media_version(conn, media_id: int, prompt: str, summary: str) -> None:
    try:
-         with db.get_connection() as conn:
-             cursor = conn.cursor()
+         cursor = conn.cursor()

-             # Get the current version number
-             cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
-             current_version = cursor.fetchone()[0] or 0
+         # Get the current version number
+         cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
+         current_version = cursor.fetchone()[0] or 0

-             # Insert the new version
-             cursor.execute('''
-                 INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
-                 VALUES (?, ?, ?, ?, ?)
-             ''', (media_id, current_version + 1, prompt, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
-             conn.commit()
+         # Insert the new version
+         cursor.execute('''
+             INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
+             VALUES (?, ?, ?, ?, ?)
+         ''', (media_id, current_version + 1, prompt, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    except DatabaseError as e:
        logging.error(f"Error adding media version: {e}")
        raise
@@ -1127,7 +1113,6 @@ def is_valid_date(date_string: str) -> bool:
    except ValueError:
        return False

-
def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'):
    db = Database()
    try:
@@ -1158,16 +1143,16 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
                media_id = existing_media[0]
                cursor.execute('''
                    UPDATE Media
-                     SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?
+                     SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?, chunking_status = ?
                    WHERE id = ?
                ''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
-                       info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), media_id))
+                       info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
            else:
                cursor.execute('''
-                     INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
-                     VALUES (?, ?, ?, ?, ?, ?, ?)
+                     INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model, chunking_status)
+                     VALUES (?, ?, ?, ?, ?, ?, ?, ?)
                ''', (url, info_dict.get('title', 'Untitled'), media_type, content,
-                       info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), whisper_model))
+                       info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), whisper_model, 'pending'))
                media_id = cursor.lastrowid

            # Add modification
@@ -1194,120 +1179,54 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
            cursor.execute('''
                INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
                VALUES (?, ?, ?, ?, ?)
-             ''', (
-                 media_id, current_version + 1, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
+             ''', (media_id, current_version + 1, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))

-             return f"Media '{info_dict.get('title', 'Untitled')}' added/updated successfully with keywords: {', '.join(keyword_list)}"
+             conn.commit()

-     except sqlite3.Error as e:
+         # Schedule chunking
+         schedule_chunking(media_id, content, info_dict.get('title', 'Untitled'))
+
+         return f"Media '{info_dict.get('title', 'Untitled')}' added/updated successfully with keywords: {', '.join(keyword_list)}. Chunking scheduled."
+
+     except DatabaseError as e:
        logging.error(f"Database error: {e}")
-         raise DatabaseError(f"Error adding media with keywords: {e}")
+         raise
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
        raise DatabaseError(f"Unexpected error: {e}")

- # Add ingested media to DB
- # def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'):
- #     max_retries = 5
- #     base_delay = 0.1
- #
- #     for attempt in range(max_retries):
- #         try:
- #             with db.get_connection() as conn:
- #                 cursor = conn.cursor()
- #                 conn.execute("BEGIN TRANSACTION")
- #
- #                 try:
- #                     # Extract content from segments
- #                     if isinstance(segments, list):
- #                         content = ' '.join([segment.get('Text', '') for segment in segments if 'Text' in segment])
- #                     elif isinstance(segments, dict):
- #                         content = segments.get('text', '') or segments.get('content', '')
- #                     else:
- #                         content = str(segments)
- #
- #                     # Process keywords
- #                     if isinstance(keywords, str):
- #                         keyword_list = [keyword.strip().lower() for keyword in keywords.split(',')]
- #                     elif isinstance(keywords, (list, tuple)):
- #                         keyword_list = [keyword.strip().lower() for keyword in keywords]
- #                     else:
- #                         keyword_list = ['default']
- #
- #                     # Check if media already exists
- #                     cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
- #                     existing_media = cursor.fetchone()
- #
- #                     if existing_media:
- #                         media_id = existing_media[0]
- #                         cursor.execute('''
- #                             UPDATE Media
- #                             SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?
- #                             WHERE id = ?
- #                         ''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
- #                               info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), media_id))
- #                     else:
- #                         cursor.execute('''
- #                             INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
- #                             VALUES (?, ?, ?, ?, ?, ?, ?)
- #                         ''', (url, info_dict.get('title', 'Untitled'), media_type, content,
- #                               info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), whisper_model))
- #                         media_id = cursor.lastrowid
- #
- #                     # Add modification
- #                     cursor.execute('''
- #                         INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
- #                         VALUES (?, ?, ?, ?)
- #                     ''', (media_id, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d')))
- #
- #                     # Process keywords
- #                     for keyword in keyword_list:
- #                         cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
- #                         cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
- #                         keyword_id = cursor.fetchone()[0]
- #                         cursor.execute('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)',
- #                                        (media_id, keyword_id))
- #
- #                     # Update full-text search index
- #                     cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
- #                                    (media_id, info_dict.get('title', 'Untitled'), content))
- #
- #                     # Add media version
- #                     cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
- #                     current_version = cursor.fetchone()[0] or 0
- #                     cursor.execute('''
- #                         INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
- #                         VALUES (?, ?, ?, ?, ?)
- #                     ''', (media_id, current_version + 1, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
- #
- #                     # Create initial document version
- #                     create_document_version(media_id, content)
- #                     # Commit the transaction
- #                     conn.commit()
- #                     logging.info(
- #                         f"Media '{info_dict.get('title', 'Untitled')}' successfully added/updated with ID: {media_id}")
- #                     return f"Media '{info_dict.get('title', 'Untitled')}' added/updated successfully with keywords: {', '.join(keyword_list)}"
- #
- #                 except Exception as e:
- #                     conn.rollback()
- #                     raise e
- #
- #             except sqlite3.OperationalError as e:
- #                 if 'database is locked' in str(e) and attempt < max_retries - 1:
- #                     delay = base_delay * (2 ** attempt)
- #                     logging.warning(f"Database is locked, retrying in {delay:.2f} seconds...")
- #                     time.sleep(delay)
- #                 else:
- #                     logging.error(f"Database error after {attempt + 1} attempts: {e}")
- #                     raise DatabaseError(f"Error adding media with keywords: {e}")
- #             except Exception as e:
- #                 logging.error(f"Unexpected error: {e}")
- #                 raise DatabaseError(f"Unexpected error: {e}")
- #
- #     raise DatabaseError("Failed to add media to database after multiple attempts")
+
+ # FIXME: This function is not complete and needs to be implemented
+ def schedule_chunking(media_id: int, content: str, media_name: str):
+     try:
+         chunks = chunk_text(content, chunk_options['method'], chunk_options['max_size'], chunk_options['overlap'])
+         db = Database()
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             for i, chunk in enumerate(chunks):
+                 cursor.execute('''
+                     INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index, chunk_id)
+                     VALUES (?, ?, ?, ?, ?)
+                 ''', (media_id, chunk, i * chunk_options['max_size'],
+                       min((i + 1) * chunk_options['max_size'], len(content)),
+                       f"{media_id}_chunk_{i}"))
+             conn.commit()
+
+         # Update chunking status
+         with db.get_connection() as conn:
+             cursor = conn.cursor()
+             cursor.execute("UPDATE Media SET chunking_status = 'completed' WHERE id = ?", (media_id,))
+             conn.commit()
+
+     except Exception as e:
+         logging.error(f"Error scheduling chunking for media_id {media_id}: {str(e)}")
+         # You might want to update the chunking_status to 'failed' here

#
# End of ....
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Functions to manage prompts DB
@@ -1570,6 +1489,9 @@ def delete_prompt(prompt_id):

#
#
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Function to fetch/update media content
@@ -1834,6 +1756,9 @@ def search_and_display_items(query, search_type, page, entries_per_page,char_cou

#
# End of Functions to manage prompts DB / Fetch and update media content
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Obsidian-related Functions
@@ -1899,6 +1824,9 @@ def import_obsidian_note_to_db(note_data):

#
# End of Obsidian-related Functions
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Chat-related Functions
@@ -2501,6 +2429,9 @@ def get_paginated_files(page: int = 1, results_per_page: int = 50) -> Tuple[List

#
# End of Functions to handle deletion of media items
+ #######################################################################################################################
+
+
#######################################################################################################################
#
# Functions to manage document versions
@@ -2572,4 +2503,153 @@ def get_document_version(media_id: int, version_number: int = None) -> Dict[str,

#
# End of Functions to manage document versions
- #######################################################################################################################
+ #######################################################################################################################
+
+
+ #######################################################################################################################
+ #
+ # Functions to manage media chunks
+
+ def process_chunks(database, chunks: List[Dict], media_id: int, batch_size: int = 100):
+     """
+     Process chunks in batches and insert them into the database.
+
+     :param database: Database instance to use for inserting chunks
+     :param chunks: List of chunk dictionaries
+     :param media_id: ID of the media these chunks belong to
+     :param batch_size: Number of chunks to process in each batch
+     """
+     total_chunks = len(chunks)
+     processed_chunks = 0
+
+     for i in range(0, total_chunks, batch_size):
+         batch = chunks[i:i + batch_size]
+         chunk_data = [
+             (media_id, chunk['text'], chunk['start_index'], chunk['end_index'])
+             for chunk in batch
+         ]
+
+         try:
+             database.execute_many(
+                 "INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index) VALUES (?, ?, ?, ?)",
+                 chunk_data
+             )
+             processed_chunks += len(batch)
+             logging.info(f"Processed {processed_chunks}/{total_chunks} chunks for media_id {media_id}")
+         except Exception as e:
+             logging.error(f"Error inserting chunk batch for media_id {media_id}: {e}")
+             # Optionally, you could raise an exception here to stop processing
+             # raise
+
+     logging.info(f"Finished processing all {total_chunks} chunks for media_id {media_id}")
+
+
+ # Usage example:
+ # chunks = [{'text': 'chunk1', 'start_index': 0, 'end_index': 10}, ...]
+ # process_chunks(db, chunks, media_id=1, batch_size=100)
+
+ def batch_insert_chunks(conn, chunks, media_id):
+     cursor = conn.cursor()
+     chunk_data = [(
+         media_id,
+         chunk['text'],
+         chunk['metadata']['start_index'],
+         chunk['metadata']['end_index'],
+         f"{media_id}_chunk_{i}"
+     ) for i, chunk in enumerate(chunks, 1)]
+
+     cursor.executemany('''
+         INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index, chunk_id)
+         VALUES (?, ?, ?, ?, ?)
+     ''', chunk_data)
+
+
+ chunk_queue = queue.Queue()
+
+ def chunk_processor():
+     while True:
+         chunk_batch = chunk_queue.get()
+         if chunk_batch is None:
+             break
+         try:
+             with db.get_connection() as conn:
+                 conn.execute("BEGIN TRANSACTION")
+                 try:
+                     batch_insert_chunks(conn, chunk_batch['chunks'], chunk_batch['media_id'])
+                     conn.commit()
+                 except Exception as e:
+                     conn.rollback()
+                     logging.error(f"Error in batch insert: {str(e)}")
+         except Exception as e:
+             logging.error(f"Error processing chunk batch: {str(e)}")
+         finally:
+             chunk_queue.task_done()
+
+ # Start the chunk processor thread
+ #chunk_processor_thread = threading.Thread(target=chunk_processor)
+ #chunk_processor_thread.start()
+
+ # Make sure to properly shut down the chunk processor when your application exits
+ # def shutdown_chunk_processor():
+ #     chunk_queue.put(None)
+ #     chunk_processor_thread.join()
+
+ #FIXME - add into main db creation code
+ def update_media_chunks_table():
+     with db.get_connection() as conn:
+         cursor = conn.cursor()
+         cursor.execute('''
+             CREATE TABLE IF NOT EXISTS MediaChunks_new (
+                 id INTEGER PRIMARY KEY AUTOINCREMENT,
+                 media_id INTEGER,
+                 chunk_text TEXT,
+                 start_index INTEGER,
+                 end_index INTEGER,
+                 chunk_id TEXT,
+                 FOREIGN KEY (media_id) REFERENCES Media(id)
+             )
+         ''')
+         cursor.execute('''
+             INSERT INTO MediaChunks_new (media_id, chunk_text, start_index, end_index)
+             SELECT media_id, chunk_text, start_index, end_index FROM MediaChunks
+         ''')
+         cursor.execute('DROP TABLE MediaChunks')
+         cursor.execute('ALTER TABLE MediaChunks_new RENAME TO MediaChunks')
+
+     logger.info("Updated MediaChunks table schema")
+
+ update_media_chunks_table()
+ # Above function is a dirty hack that should be merged into the initial DB creation statement. This is a placeholder
+ # FIXME
+
+
+ # This is backwards compatibility for older setups.
+ # Function to add a missing column to the Media table
+ def add_missing_column_if_not_exists(db, table_name, column_name, column_definition):
+     try:
+         # Check if the column already exists in the table
+         cursor = db.cursor()
+         cursor.execute(f"PRAGMA table_info({table_name})")
+         columns = [column[1] for column in cursor.fetchall()]
+
+         # If the column is not found, add it
+         if column_name not in columns:
+             logging.info(f"Adding missing column '{column_name}' to table '{table_name}'")
+             cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_definition}")
+             db.commit()
+             logging.info(f"Column '{column_name}' added successfully.")
+         else:
+             logging.info(f"Column '{column_name}' already exists in table '{table_name}'")
+
+     except sqlite3.Error as e:
+         logging.error(f"Error checking or adding column '{column_name}' in table '{table_name}': {e}")
+         raise
+
+ # Example usage of the function
+ def update_media_table(db):
+     # Add chunking_status column if it doesn't exist
+     add_missing_column_if_not_exists(db, 'Media', 'chunking_status', "TEXT DEFAULT 'pending'")
+
+ #
+ # End of Functions to manage media chunks
+ #######################################################################################################################
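For reference, a minimal usage sketch (Python) of the new chunk helpers added above. It assumes the module-level `db` instance created in SQLite_DB.py and uses illustrative chunk values:

from App_Function_Libraries.DB.SQLite_DB import db, process_chunks

# Chunk dicts follow the shape documented in process_chunks: text plus start/end indices.
chunks = [
    {'text': 'first chunk', 'start_index': 0, 'end_index': 11},
    {'text': 'second chunk', 'start_index': 11, 'end_index': 23},
]
process_chunks(db, chunks, media_id=1, batch_size=100)  # inserts in batches via Database.execute_many()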