Spaces:
Running
Running
oceansweep
commited on
Commit
•
66a2900
1
Parent(s):
0215147
Upload 2 files
Browse files- App_Function_Libraries/DB/DB_Manager.py +74 -141
- App_Function_Libraries/DB/SQLite_DB.py +323 -243
App_Function_Libraries/DB/DB_Manager.py
CHANGED
@@ -5,16 +5,14 @@
|
|
5 |
import configparser
|
6 |
import os
|
7 |
import logging
|
8 |
-
import threading
|
9 |
-
from contextlib import contextmanager
|
10 |
from typing import Tuple, List, Union, Dict
|
11 |
-
import sqlite3
|
12 |
import time
|
13 |
#
|
14 |
# 3rd-Party Libraries
|
15 |
from elasticsearch import Elasticsearch
|
16 |
#
|
17 |
# Import your existing SQLite functions
|
|
|
18 |
from App_Function_Libraries.DB.SQLite_DB import (
|
19 |
update_media_content as sqlite_update_media_content,
|
20 |
list_prompts as sqlite_list_prompts,
|
@@ -49,166 +47,93 @@ from App_Function_Libraries.DB.SQLite_DB import (
|
|
49 |
search_and_display_items as sqlite_search_and_display_items,
|
50 |
get_conversation_name as sqlite_get_conversation_name,
|
51 |
add_media_with_keywords as sqlite_add_media_with_keywords,
|
52 |
-
check_media_and_whisper_model as sqlite_check_media_and_whisper_model,
|
53 |
-
|
54 |
-
get_document_version as sqlite_get_document_version, sqlite_search_db, sqlite_add_media_chunk,
|
55 |
sqlite_update_fts_for_media, sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
|
56 |
search_media_database as sqlite_search_media_database, mark_as_trash as sqlite_mark_as_trash, \
|
57 |
get_media_transcripts as sqlite_get_media_transcripts, get_specific_transcript as sqlite_get_specific_transcript, \
|
58 |
get_media_summaries as sqlite_get_media_summaries, get_specific_summary as sqlite_get_specific_summary, \
|
59 |
get_media_prompts as sqlite_get_media_prompts, get_specific_prompt as sqlite_get_specific_prompt, \
|
60 |
-
delete_specific_transcript as sqlite_delete_specific_transcript,
|
61 |
-
|
|
|
|
|
62 |
update_keywords_for_media as sqlite_update_keywords_for_media, check_media_exists as sqlite_check_media_exists, \
|
63 |
search_prompts as sqlite_search_prompts, get_media_content as sqlite_get_media_content, \
|
64 |
get_paginated_files as sqlite_get_paginated_files, get_media_title as sqlite_get_media_title, \
|
65 |
get_all_content_from_database as sqlite_get_all_content_from_database,
|
|
|
|
|
66 |
)
|
67 |
#
|
68 |
# Local Imports
|
69 |
from App_Function_Libraries.Utils.Utils import load_comprehensive_config, get_database_path, get_project_relative_path
|
70 |
#
|
71 |
# End of imports
|
|
|
|
|
|
|
72 |
############################################################################################################
|
73 |
#
|
74 |
-
#
|
|
|
|
|
75 |
|
76 |
-
# Load configuration from config file
|
77 |
config_path = get_project_relative_path('Config_Files/config.txt')
|
78 |
config = configparser.ConfigParser()
|
79 |
config.read(config_path)
|
80 |
|
81 |
db_path: str = config.get('Database', 'sqlite_path', fallback='./Databases/media_summary.db')
|
82 |
-
|
83 |
backup_path: str = config.get('Database', 'backup_path', fallback='database_backups')
|
84 |
-
|
85 |
backup_dir: Union[str, bytes] = os.environ.get('DB_BACKUP_DIR', backup_path)
|
86 |
|
87 |
-
|
88 |
-
|
89 |
-
|
90 |
-
#
|
91 |
-
# Database Manager Class
|
92 |
|
93 |
-
|
94 |
-
|
|
|
95 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
96 |
|
97 |
-
|
98 |
-
|
99 |
-
|
100 |
-
|
101 |
-
|
102 |
-
|
103 |
-
|
104 |
-
|
105 |
-
@contextmanager
|
106 |
-
def get_connection(self):
|
107 |
-
retry_count = 5
|
108 |
-
retry_delay = 1
|
109 |
-
while retry_count > 0:
|
110 |
-
try:
|
111 |
-
if self.pool:
|
112 |
-
conn = self.pool.pop()
|
113 |
-
else:
|
114 |
-
conn = sqlite3.connect(self.db_path, timeout=self.timeout, check_same_thread=False)
|
115 |
-
conn.execute("PRAGMA journal_mode=WAL;") # Enable WAL mode
|
116 |
-
yield conn
|
117 |
-
self.pool.append(conn)
|
118 |
-
return
|
119 |
-
except sqlite3.OperationalError as e:
|
120 |
-
if 'database is locked' in str(e):
|
121 |
-
logger.warning(f"Database is locked, retrying in {retry_delay} seconds...")
|
122 |
-
retry_count -= 1
|
123 |
-
time.sleep(retry_delay)
|
124 |
-
retry_delay *= 2 # Exponential backoff
|
125 |
-
else:
|
126 |
-
raise DatabaseError(f"Database error: {e}")
|
127 |
-
except Exception as e:
|
128 |
-
raise DatabaseError(f"Unexpected error: {e}")
|
129 |
-
raise DatabaseError("Database is locked and retries have been exhausted")
|
130 |
-
|
131 |
-
def execute_query(self, query: str, params: Tuple = ()) -> None:
|
132 |
-
with self.lock: # Use a global lock for write operations
|
133 |
-
with self.get_connection() as conn:
|
134 |
-
try:
|
135 |
-
cursor = conn.cursor()
|
136 |
-
cursor.execute(query, params)
|
137 |
-
conn.commit()
|
138 |
-
except sqlite3.Error as e:
|
139 |
-
logger.error(f"Database error: {e}, Query: {query}")
|
140 |
-
raise DatabaseError(f"Database error: {e}, Query: {query}")
|
141 |
-
|
142 |
-
def execute_many(self, query: str, params_list: List[Tuple]) -> None:
|
143 |
-
with self.lock: # Use a global lock for write operations
|
144 |
-
with self.get_connection() as conn:
|
145 |
-
try:
|
146 |
-
cursor = conn.cursor()
|
147 |
-
cursor.executemany(query, params_list)
|
148 |
-
conn.commit()
|
149 |
-
except sqlite3.Error as e:
|
150 |
-
logger.error(f"Database error: {e}, Query: {query}")
|
151 |
-
raise DatabaseError(f"Database error: {e}, Query: {query}")
|
152 |
-
|
153 |
-
def close_all_connections(self):
|
154 |
-
for conn in self.pool:
|
155 |
-
conn.close()
|
156 |
-
self.pool.clear()
|
157 |
|
|
|
|
|
|
|
|
|
|
|
158 |
|
159 |
-
|
160 |
-
|
161 |
-
# def __init__(self, db_name='media_summary.db'):
|
162 |
-
# self.db_path = get_database_path(db_name)
|
163 |
-
# self.pool = []
|
164 |
-
# self.pool_size = 10
|
165 |
-
#
|
166 |
-
# @contextmanager
|
167 |
-
# def get_connection(self):
|
168 |
-
# retry_count = 5
|
169 |
-
# retry_delay = 1
|
170 |
-
# while retry_count > 0:
|
171 |
-
# try:
|
172 |
-
# if self.pool:
|
173 |
-
# conn = self.pool.pop()
|
174 |
-
# else:
|
175 |
-
# conn = sqlite3.connect(self.db_path, check_same_thread=False)
|
176 |
-
# yield conn
|
177 |
-
# self.pool.append(conn)
|
178 |
-
# return
|
179 |
-
# except sqlite3.OperationalError as e:
|
180 |
-
# if 'database is locked' in str(e):
|
181 |
-
# logger.warning(f"Database is locked, retrying in {retry_delay} seconds...")
|
182 |
-
# retry_count -= 1
|
183 |
-
# time.sleep(retry_delay)
|
184 |
-
# retry_delay *= 2 # Exponential backoff
|
185 |
-
# else:
|
186 |
-
# raise DatabaseError(f"Database error: {e}")
|
187 |
-
# except Exception as e:
|
188 |
-
# raise DatabaseError(f"Unexpected error: {e}")
|
189 |
-
# raise DatabaseError("Database is locked and retries have been exhausted")
|
190 |
-
#
|
191 |
-
# def execute_query(self, query: str, params: Tuple = ()) -> None:
|
192 |
-
# with self.get_connection() as conn:
|
193 |
-
# try:
|
194 |
-
# cursor = conn.cursor()
|
195 |
-
# cursor.execute(query, params)
|
196 |
-
# conn.commit()
|
197 |
-
# except sqlite3.Error as e:
|
198 |
-
# logger.error(f"Database error: {e}, Query: {query}")
|
199 |
-
# raise DatabaseError(f"Database error: {e}, Query: {query}")
|
200 |
-
#
|
201 |
-
# def close_all_connections(self):
|
202 |
-
# for conn in self.pool:
|
203 |
-
# conn.close()
|
204 |
-
# self.pool.clear()
|
205 |
-
#
|
206 |
|
207 |
-
|
208 |
-
|
209 |
-
|
210 |
-
|
211 |
-
|
|
|
|
|
|
|
212 |
|
213 |
def get_db_config():
|
214 |
try:
|
@@ -335,6 +260,12 @@ def get_media_title(*args, **kwargs):
|
|
335 |
# Implement Elasticsearch version
|
336 |
raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented")
|
337 |
|
|
|
|
|
|
|
|
|
|
|
|
|
338 |
|
339 |
#
|
340 |
# End of DB-Searching functions
|
@@ -441,9 +372,18 @@ def ingest_article_to_db(url, title, author, content, keywords, summary, ingesti
|
|
441 |
raise ValueError(f"Unsupported database type: {db_type}")
|
442 |
|
443 |
|
444 |
-
def add_media_chunk(
|
445 |
if db_type == 'sqlite':
|
446 |
-
sqlite_add_media_chunk(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
447 |
elif db_type == 'elasticsearch':
|
448 |
# Implement Elasticsearch version
|
449 |
raise NotImplementedError("Elasticsearch version not yet implemented")
|
@@ -850,16 +790,9 @@ def get_document_version(*args, **kwargs):
|
|
850 |
# End of Document Versioning Functions
|
851 |
############################################################################################################
|
852 |
|
853 |
-
|
854 |
-
|
855 |
-
############################################################################################################
|
856 |
-
#
|
857 |
-
# Function to close the database connection for SQLite
|
858 |
-
|
859 |
def close_connection():
|
860 |
if db_type == 'sqlite':
|
861 |
-
db.
|
862 |
-
# Elasticsearch doesn't need explicit closing
|
863 |
|
864 |
#
|
865 |
# End of file
|
|
|
5 |
import configparser
|
6 |
import os
|
7 |
import logging
|
|
|
|
|
8 |
from typing import Tuple, List, Union, Dict
|
|
|
9 |
import time
|
10 |
#
|
11 |
# 3rd-Party Libraries
|
12 |
from elasticsearch import Elasticsearch
|
13 |
#
|
14 |
# Import your existing SQLite functions
|
15 |
+
from App_Function_Libraries.DB.SQLite_DB import DatabaseError
|
16 |
from App_Function_Libraries.DB.SQLite_DB import (
|
17 |
update_media_content as sqlite_update_media_content,
|
18 |
list_prompts as sqlite_list_prompts,
|
|
|
47 |
search_and_display_items as sqlite_search_and_display_items,
|
48 |
get_conversation_name as sqlite_get_conversation_name,
|
49 |
add_media_with_keywords as sqlite_add_media_with_keywords,
|
50 |
+
check_media_and_whisper_model as sqlite_check_media_and_whisper_model, \
|
51 |
+
create_document_version as sqlite_create_document_version,
|
52 |
+
get_document_version as sqlite_get_document_version, sqlite_search_db, add_media_chunk as sqlite_add_media_chunk,
|
53 |
sqlite_update_fts_for_media, sqlite_get_unprocessed_media, fetch_item_details as sqlite_fetch_item_details, \
|
54 |
search_media_database as sqlite_search_media_database, mark_as_trash as sqlite_mark_as_trash, \
|
55 |
get_media_transcripts as sqlite_get_media_transcripts, get_specific_transcript as sqlite_get_specific_transcript, \
|
56 |
get_media_summaries as sqlite_get_media_summaries, get_specific_summary as sqlite_get_specific_summary, \
|
57 |
get_media_prompts as sqlite_get_media_prompts, get_specific_prompt as sqlite_get_specific_prompt, \
|
58 |
+
delete_specific_transcript as sqlite_delete_specific_transcript,
|
59 |
+
delete_specific_summary as sqlite_delete_specific_summary, \
|
60 |
+
delete_specific_prompt as sqlite_delete_specific_prompt,
|
61 |
+
fetch_keywords_for_media as sqlite_fetch_keywords_for_media, \
|
62 |
update_keywords_for_media as sqlite_update_keywords_for_media, check_media_exists as sqlite_check_media_exists, \
|
63 |
search_prompts as sqlite_search_prompts, get_media_content as sqlite_get_media_content, \
|
64 |
get_paginated_files as sqlite_get_paginated_files, get_media_title as sqlite_get_media_title, \
|
65 |
get_all_content_from_database as sqlite_get_all_content_from_database,
|
66 |
+
get_next_media_id as sqlite_get_next_media_id, \
|
67 |
+
batch_insert_chunks as sqlite_batch_insert_chunks, Database,
|
68 |
)
|
69 |
#
|
70 |
# Local Imports
|
71 |
from App_Function_Libraries.Utils.Utils import load_comprehensive_config, get_database_path, get_project_relative_path
|
72 |
#
|
73 |
# End of imports
|
74 |
+
############################################################################################################
|
75 |
+
|
76 |
+
|
77 |
############################################################################################################
|
78 |
#
|
79 |
+
# Database Config loading
|
80 |
+
|
81 |
+
logger = logging.getLogger(__name__)
|
82 |
|
|
|
83 |
config_path = get_project_relative_path('Config_Files/config.txt')
|
84 |
config = configparser.ConfigParser()
|
85 |
config.read(config_path)
|
86 |
|
87 |
db_path: str = config.get('Database', 'sqlite_path', fallback='./Databases/media_summary.db')
|
|
|
88 |
backup_path: str = config.get('Database', 'backup_path', fallback='database_backups')
|
|
|
89 |
backup_dir: Union[str, bytes] = os.environ.get('DB_BACKUP_DIR', backup_path)
|
90 |
|
91 |
+
def get_db_config():
|
92 |
+
try:
|
93 |
+
config = load_comprehensive_config()
|
|
|
|
|
94 |
|
95 |
+
if 'Database' not in config:
|
96 |
+
print("Warning: 'Database' section not found in config. Using default values.")
|
97 |
+
return default_db_config()
|
98 |
|
99 |
+
return {
|
100 |
+
'type': config.get('Database', 'type', fallback='sqlite'),
|
101 |
+
'sqlite_path': config.get('Database', 'sqlite_path', fallback='Databases/media_summary.db'),
|
102 |
+
'elasticsearch_host': config.get('Database', 'elasticsearch_host', fallback='localhost'),
|
103 |
+
'elasticsearch_port': config.getint('Database', 'elasticsearch_port', fallback=9200)
|
104 |
+
}
|
105 |
+
except FileNotFoundError:
|
106 |
+
print("Warning: Config file not found. Using default database configuration.")
|
107 |
+
return default_db_config()
|
108 |
+
except Exception as e:
|
109 |
+
print(f"Error reading config: {str(e)}. Using default database configuration.")
|
110 |
+
return default_db_config()
|
111 |
|
112 |
+
def default_db_config():
|
113 |
+
return {
|
114 |
+
'type': 'sqlite',
|
115 |
+
'sqlite_path': get_database_path('media_summary.db'),
|
116 |
+
'elasticsearch_host': 'localhost',
|
117 |
+
'elasticsearch_port': 9200
|
118 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
119 |
|
120 |
+
def ensure_directory_exists(file_path):
|
121 |
+
directory = os.path.dirname(file_path)
|
122 |
+
if not os.path.exists(directory):
|
123 |
+
os.makedirs(directory)
|
124 |
+
print(f"Created directory: {directory}")
|
125 |
|
126 |
+
db_config = get_db_config()
|
127 |
+
db_type = db_config['type']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
128 |
|
129 |
+
if db_type == 'sqlite':
|
130 |
+
db = Database(os.path.basename(db_config['sqlite_path']))
|
131 |
+
elif db_type == 'elasticsearch':
|
132 |
+
raise NotImplementedError("Elasticsearch support not yet implemented")
|
133 |
+
else:
|
134 |
+
raise ValueError(f"Unsupported database type: {db_type}")
|
135 |
+
|
136 |
+
print(f"Database path: {db.db_path}")
|
137 |
|
138 |
def get_db_config():
|
139 |
try:
|
|
|
260 |
# Implement Elasticsearch version
|
261 |
raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented")
|
262 |
|
263 |
+
def get_next_media_id():
|
264 |
+
if db_type == 'sqlite':
|
265 |
+
return sqlite_get_next_media_id()
|
266 |
+
elif db_type == 'elasticsearch':
|
267 |
+
# Implement Elasticsearch version
|
268 |
+
raise NotImplementedError("Elasticsearch version of add_media_with_keywords not yet implemented")
|
269 |
|
270 |
#
|
271 |
# End of DB-Searching functions
|
|
|
372 |
raise ValueError(f"Unsupported database type: {db_type}")
|
373 |
|
374 |
|
375 |
+
def add_media_chunk(*args, **kwargs):
|
376 |
if db_type == 'sqlite':
|
377 |
+
sqlite_add_media_chunk(*args, **kwargs)
|
378 |
+
elif db_type == 'elasticsearch':
|
379 |
+
# Implement Elasticsearch version
|
380 |
+
raise NotImplementedError("Elasticsearch version not yet implemented")
|
381 |
+
else:
|
382 |
+
raise ValueError(f"Unsupported database type: {db_type}")
|
383 |
+
|
384 |
+
def batch_insert_chunks(*args, **kwargs):
|
385 |
+
if db_type == 'sqlite':
|
386 |
+
sqlite_batch_insert_chunks(*args, **kwargs)
|
387 |
elif db_type == 'elasticsearch':
|
388 |
# Implement Elasticsearch version
|
389 |
raise NotImplementedError("Elasticsearch version not yet implemented")
|
|
|
790 |
# End of Document Versioning Functions
|
791 |
############################################################################################################
|
792 |
|
|
|
|
|
|
|
|
|
|
|
|
|
793 |
def close_connection():
|
794 |
if db_type == 'sqlite':
|
795 |
+
db.get_connection().close()
|
|
|
796 |
|
797 |
#
|
798 |
# End of file
|
App_Function_Libraries/DB/SQLite_DB.py
CHANGED
@@ -49,24 +49,22 @@ import csv
|
|
49 |
import html
|
50 |
import logging
|
51 |
import os
|
|
|
52 |
import re
|
53 |
import shutil
|
54 |
import sqlite3
|
55 |
-
import time
|
56 |
import traceback
|
57 |
-
from contextlib import contextmanager
|
58 |
from datetime import datetime, timedelta
|
59 |
-
from typing import List, Tuple, Dict, Any
|
60 |
# Local Libraries
|
61 |
-
from App_Function_Libraries.Utils.Utils import
|
62 |
-
get_database_dir
|
|
|
|
|
63 |
# Third-Party Libraries
|
64 |
import gradio as gr
|
65 |
import pandas as pd
|
66 |
import yaml
|
67 |
-
|
68 |
-
|
69 |
-
# Import Local Libraries
|
70 |
#
|
71 |
#######################################################################################################################
|
72 |
# Function Definitions
|
@@ -78,8 +76,6 @@ def ensure_database_directory():
|
|
78 |
ensure_database_directory()
|
79 |
|
80 |
# Set up logging
|
81 |
-
#logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
|
82 |
-
#logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
|
83 |
logger = logging.getLogger(__name__)
|
84 |
|
85 |
# FIXME - Setup properly and test/add documentation for its existence...
|
@@ -117,6 +113,9 @@ print(f"Backup directory: {backup_dir}")
|
|
117 |
|
118 |
#
|
119 |
#
|
|
|
|
|
|
|
120 |
#######################################################################################################################
|
121 |
#
|
122 |
# Backup-related functions
|
@@ -180,6 +179,9 @@ def rotate_backups(backup_dir, max_backups=10):
|
|
180 |
|
181 |
#
|
182 |
#
|
|
|
|
|
|
|
183 |
#######################################################################################################################
|
184 |
#
|
185 |
# DB-Integrity Check Functions
|
@@ -204,88 +206,50 @@ def check_database_integrity(db_path):
|
|
204 |
|
205 |
#
|
206 |
# End of DB-Integrity Check functions
|
|
|
|
|
|
|
207 |
#######################################################################################################################
|
208 |
#
|
209 |
-
#
|
210 |
|
211 |
-
# Custom exceptions
|
212 |
class DatabaseError(Exception):
|
213 |
pass
|
214 |
|
215 |
-
|
216 |
class InputError(Exception):
|
217 |
pass
|
218 |
|
219 |
-
|
220 |
-
# Database connection function with connection pooling
|
221 |
-
|
222 |
class Database:
|
223 |
-
def __init__(self, db_name=
|
224 |
-
self.
|
225 |
-
self.
|
226 |
-
ensure_directory_exists(os.path.dirname(self.db_path))
|
227 |
-
self.pool = []
|
228 |
-
self.pool_size = 10
|
229 |
-
logging.info(f"Database initialized with path: {self.db_path}")
|
230 |
-
|
231 |
-
@contextmanager
|
232 |
-
def get_connection(self):
|
233 |
-
conn = None
|
234 |
-
try:
|
235 |
-
conn = self._get_connection_from_pool()
|
236 |
-
yield conn
|
237 |
-
except Exception as e:
|
238 |
-
if conn:
|
239 |
-
conn.rollback()
|
240 |
-
raise e
|
241 |
-
finally:
|
242 |
-
if conn:
|
243 |
-
conn.commit()
|
244 |
-
self.release_connection(conn)
|
245 |
|
246 |
-
def
|
247 |
-
|
248 |
-
retry_delay = 1
|
249 |
-
while retry_count > 0:
|
250 |
-
try:
|
251 |
-
if self.pool:
|
252 |
-
return self.pool.pop()
|
253 |
-
else:
|
254 |
-
return sqlite3.connect(self.db_path, check_same_thread=False)
|
255 |
-
except sqlite3.OperationalError as e:
|
256 |
-
if 'database is locked' in str(e):
|
257 |
-
logging.warning(f"Database is locked, retrying in {retry_delay} seconds...")
|
258 |
-
retry_count -= 1
|
259 |
-
time.sleep(retry_delay)
|
260 |
-
else:
|
261 |
-
logging.error(f"Database error: {e}")
|
262 |
-
raise DatabaseError(f"Database error: {e}")
|
263 |
-
except Exception as e:
|
264 |
-
logging.error(f"Unexpected error: {e}")
|
265 |
-
raise DatabaseError(f"Unexpected error: {e}")
|
266 |
-
raise DatabaseError("Database is locked and retries have been exhausted")
|
267 |
-
|
268 |
-
def release_connection(self, conn):
|
269 |
-
if len(self.pool) < self.pool_size:
|
270 |
-
self.pool.append(conn)
|
271 |
-
else:
|
272 |
-
conn.close()
|
273 |
|
274 |
def execute_query(self, query: str, params: Tuple = ()) -> None:
|
275 |
with self.get_connection() as conn:
|
276 |
-
|
277 |
-
|
278 |
-
|
279 |
-
|
280 |
-
|
281 |
-
|
282 |
-
|
283 |
-
logging.info("All database connections closed")
|
284 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
285 |
|
286 |
db = Database()
|
287 |
|
288 |
-
def
|
289 |
global sqlite_db
|
290 |
sqlite_db = Database()
|
291 |
|
@@ -308,7 +272,8 @@ def create_tables(db) -> None:
|
|
308 |
transcription_model TEXT,
|
309 |
is_trash BOOLEAN DEFAULT 0,
|
310 |
trash_date DATETIME,
|
311 |
-
vector_embedding BLOB
|
|
|
312 |
)
|
313 |
''',
|
314 |
'''
|
@@ -462,13 +427,21 @@ def create_tables(db) -> None:
|
|
462 |
create_tables(db)
|
463 |
|
464 |
|
465 |
-
def check_media_exists(title, url):
|
466 |
-
|
467 |
-
|
468 |
-
|
469 |
-
|
470 |
-
|
471 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
472 |
|
473 |
|
474 |
def check_media_and_whisper_model(title=None, url=None, current_whisper_model=None):
|
@@ -540,7 +513,7 @@ def check_media_and_whisper_model(title=None, url=None, current_whisper_model=No
|
|
540 |
return False, f"Media found with same whisper model (ID: {media_id})"
|
541 |
|
542 |
|
543 |
-
def
|
544 |
with db.get_connection() as conn:
|
545 |
cursor = conn.cursor()
|
546 |
cursor.execute(
|
@@ -562,8 +535,21 @@ def sqlite_get_unprocessed_media(db):
|
|
562 |
cursor.execute("SELECT id, content, type FROM Media WHERE id NOT IN (SELECT DISTINCT media_id FROM MediaChunks)")
|
563 |
return cursor.fetchall()
|
564 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
565 |
#
|
566 |
# End of Media-related Functions
|
|
|
|
|
|
|
567 |
#######################################################################################################################
|
568 |
# Keyword-related Functions
|
569 |
#
|
@@ -612,8 +598,12 @@ def delete_keyword(keyword: str) -> str:
|
|
612 |
# Function to add media with keywords
|
613 |
def add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author,
|
614 |
ingestion_date):
|
|
|
615 |
# Set default values for missing fields
|
616 |
-
|
|
|
|
|
|
|
617 |
title = title or 'Untitled'
|
618 |
media_type = media_type or 'Unknown'
|
619 |
content = content or 'No content available'
|
@@ -624,10 +614,6 @@ def add_media_with_keywords(url, title, media_type, content, keywords, prompt, s
|
|
624 |
author = author or 'Unknown'
|
625 |
ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')
|
626 |
|
627 |
-
# Ensure URL is valid
|
628 |
-
if not is_valid_url(url):
|
629 |
-
url = 'localhost'
|
630 |
-
|
631 |
if media_type not in ['article', 'audio', 'document', 'mediawiki_article', 'mediawiki_dump', 'obsidian_note', 'podcast', 'text', 'video', 'unknown']:
|
632 |
raise InputError("Invalid media type. Allowed types: article, audio file, document, obsidian_note podcast, text, video, unknown.")
|
633 |
|
@@ -653,68 +639,64 @@ def add_media_with_keywords(url, title, media_type, content, keywords, prompt, s
|
|
653 |
|
654 |
try:
|
655 |
with db.get_connection() as conn:
|
656 |
-
conn.execute("BEGIN TRANSACTION")
|
657 |
cursor = conn.cursor()
|
658 |
|
659 |
-
# Check if media already exists
|
660 |
-
|
661 |
-
|
662 |
-
|
663 |
-
if existing_media:
|
664 |
-
media_id = existing_media[0]
|
665 |
-
logging.info(f"Updating existing media with ID: {media_id}")
|
666 |
|
|
|
|
|
|
|
667 |
cursor.execute('''
|
668 |
UPDATE Media
|
669 |
-
SET content = ?, transcription_model = ?,
|
670 |
WHERE id = ?
|
671 |
-
''', (content, transcription_model,
|
672 |
else:
|
673 |
-
logging.
|
674 |
-
|
675 |
cursor.execute('''
|
676 |
INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
|
677 |
VALUES (?, ?, ?, ?, ?, ?, ?)
|
678 |
''', (url, title, media_type, content, author, ingestion_date, transcription_model))
|
679 |
media_id = cursor.lastrowid
|
|
|
680 |
|
681 |
-
logging.info(f"Adding new modification to MediaModifications for media ID: {media_id}")
|
682 |
cursor.execute('''
|
683 |
INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
|
684 |
VALUES (?, ?, ?, ?)
|
685 |
''', (media_id, prompt, summary, ingestion_date))
|
686 |
-
logger.info("New modification added to MediaModifications")
|
687 |
|
688 |
-
#
|
689 |
-
|
690 |
-
|
691 |
-
|
692 |
-
|
693 |
-
|
694 |
-
|
695 |
-
|
696 |
-
|
|
|
|
|
|
|
697 |
|
698 |
# Update full-text search index
|
699 |
-
logging.info("Updating full-text search index")
|
700 |
cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
|
701 |
(media_id, title, content))
|
702 |
|
703 |
-
|
704 |
-
add_media_version(media_id, prompt, summary)
|
705 |
|
706 |
conn.commit()
|
707 |
logging.info(f"Media '{title}' successfully added/updated with ID: {media_id}")
|
708 |
|
709 |
-
|
710 |
|
711 |
except sqlite3.Error as e:
|
712 |
-
|
713 |
-
logging.error(f"SQL Error: {e}")
|
714 |
raise DatabaseError(f"Error adding media with keywords: {e}")
|
715 |
except Exception as e:
|
716 |
-
|
717 |
-
logging.error(f"Unexpected Error: {e}")
|
718 |
raise DatabaseError(f"Unexpected error: {e}")
|
719 |
|
720 |
|
@@ -866,6 +848,9 @@ def update_keywords_for_media(media_id, keyword_list):
|
|
866 |
|
867 |
#
|
868 |
# End of Keyword-related functions
|
|
|
|
|
|
|
869 |
#######################################################################################################################
|
870 |
#
|
871 |
# Media-related Functions
|
@@ -922,7 +907,10 @@ def fetch_item_details(media_id: int):
|
|
922 |
return "", "", ""
|
923 |
|
924 |
#
|
925 |
-
#
|
|
|
|
|
|
|
926 |
#######################################################################################################################
|
927 |
#
|
928 |
# Media-related Functions
|
@@ -930,21 +918,19 @@ def fetch_item_details(media_id: int):
|
|
930 |
|
931 |
|
932 |
# Function to add a version of a prompt and summary
|
933 |
-
def add_media_version(media_id: int, prompt: str, summary: str) -> None:
|
934 |
try:
|
935 |
-
|
936 |
-
cursor = conn.cursor()
|
937 |
|
938 |
-
|
939 |
-
|
940 |
-
|
941 |
|
942 |
-
|
943 |
-
|
944 |
-
|
945 |
-
|
946 |
-
|
947 |
-
conn.commit()
|
948 |
except DatabaseError as e:
|
949 |
logging.error(f"Error adding media version: {e}")
|
950 |
raise
|
@@ -1127,7 +1113,6 @@ def is_valid_date(date_string: str) -> bool:
|
|
1127 |
except ValueError:
|
1128 |
return False
|
1129 |
|
1130 |
-
|
1131 |
def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'):
|
1132 |
db = Database()
|
1133 |
try:
|
@@ -1158,16 +1143,16 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
|
|
1158 |
media_id = existing_media[0]
|
1159 |
cursor.execute('''
|
1160 |
UPDATE Media
|
1161 |
-
SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?
|
1162 |
WHERE id = ?
|
1163 |
''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
|
1164 |
-
info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), media_id))
|
1165 |
else:
|
1166 |
cursor.execute('''
|
1167 |
-
INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
|
1168 |
-
VALUES (?, ?, ?, ?, ?, ?, ?)
|
1169 |
''', (url, info_dict.get('title', 'Untitled'), media_type, content,
|
1170 |
-
info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), whisper_model))
|
1171 |
media_id = cursor.lastrowid
|
1172 |
|
1173 |
# Add modification
|
@@ -1194,120 +1179,54 @@ def add_media_to_database(url, info_dict, segments, summary, keywords, custom_pr
|
|
1194 |
cursor.execute('''
|
1195 |
INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
|
1196 |
VALUES (?, ?, ?, ?, ?)
|
1197 |
-
''', (
|
1198 |
-
media_id, current_version + 1, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
|
1199 |
|
1200 |
-
|
1201 |
|
1202 |
-
|
|
|
|
|
|
|
|
|
|
|
1203 |
logging.error(f"Database error: {e}")
|
1204 |
-
raise
|
1205 |
except Exception as e:
|
1206 |
logging.error(f"Unexpected error: {e}")
|
1207 |
raise DatabaseError(f"Unexpected error: {e}")
|
1208 |
|
1209 |
-
|
1210 |
-
#
|
1211 |
-
|
1212 |
-
|
1213 |
-
|
1214 |
-
|
1215 |
-
|
1216 |
-
|
1217 |
-
|
1218 |
-
|
1219 |
-
|
1220 |
-
|
1221 |
-
|
1222 |
-
|
1223 |
-
|
1224 |
-
|
1225 |
-
|
1226 |
-
#
|
1227 |
-
|
1228 |
-
|
1229 |
-
|
1230 |
-
|
1231 |
-
|
1232 |
-
|
1233 |
-
|
1234 |
-
#
|
1235 |
-
# keyword_list = ['default']
|
1236 |
-
#
|
1237 |
-
# # Check if media already exists
|
1238 |
-
# cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
|
1239 |
-
# existing_media = cursor.fetchone()
|
1240 |
-
#
|
1241 |
-
# if existing_media:
|
1242 |
-
# media_id = existing_media[0]
|
1243 |
-
# cursor.execute('''
|
1244 |
-
# UPDATE Media
|
1245 |
-
# SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?
|
1246 |
-
# WHERE id = ?
|
1247 |
-
# ''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
|
1248 |
-
# info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), media_id))
|
1249 |
-
# else:
|
1250 |
-
# cursor.execute('''
|
1251 |
-
# INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
|
1252 |
-
# VALUES (?, ?, ?, ?, ?, ?, ?)
|
1253 |
-
# ''', (url, info_dict.get('title', 'Untitled'), media_type, content,
|
1254 |
-
# info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), whisper_model))
|
1255 |
-
# media_id = cursor.lastrowid
|
1256 |
-
#
|
1257 |
-
# # Add modification
|
1258 |
-
# cursor.execute('''
|
1259 |
-
# INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
|
1260 |
-
# VALUES (?, ?, ?, ?)
|
1261 |
-
# ''', (media_id, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d')))
|
1262 |
-
#
|
1263 |
-
# # Process keywords
|
1264 |
-
# for keyword in keyword_list:
|
1265 |
-
# cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
|
1266 |
-
# cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
|
1267 |
-
# keyword_id = cursor.fetchone()[0]
|
1268 |
-
# cursor.execute('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)',
|
1269 |
-
# (media_id, keyword_id))
|
1270 |
-
#
|
1271 |
-
# # Update full-text search index
|
1272 |
-
# cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
|
1273 |
-
# (media_id, info_dict.get('title', 'Untitled'), content))
|
1274 |
-
#
|
1275 |
-
# # Add media version
|
1276 |
-
# cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
|
1277 |
-
# current_version = cursor.fetchone()[0] or 0
|
1278 |
-
# cursor.execute('''
|
1279 |
-
# INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
|
1280 |
-
# VALUES (?, ?, ?, ?, ?)
|
1281 |
-
# ''', (media_id, current_version + 1, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
|
1282 |
-
#
|
1283 |
-
# # Create initial document version
|
1284 |
-
# create_document_version(media_id, content)
|
1285 |
-
# # Commit the transaction
|
1286 |
-
# conn.commit()
|
1287 |
-
# logging.info(
|
1288 |
-
# f"Media '{info_dict.get('title', 'Untitled')}' successfully added/updated with ID: {media_id}")
|
1289 |
-
# return f"Media '{info_dict.get('title', 'Untitled')}' added/updated successfully with keywords: {', '.join(keyword_list)}"
|
1290 |
-
#
|
1291 |
-
# except Exception as e:
|
1292 |
-
# conn.rollback()
|
1293 |
-
# raise e
|
1294 |
-
#
|
1295 |
-
# except sqlite3.OperationalError as e:
|
1296 |
-
# if 'database is locked' in str(e) and attempt < max_retries - 1:
|
1297 |
-
# delay = base_delay * (2 ** attempt)
|
1298 |
-
# logging.warning(f"Database is locked, retrying in {delay:.2f} seconds...")
|
1299 |
-
# time.sleep(delay)
|
1300 |
-
# else:
|
1301 |
-
# logging.error(f"Database error after {attempt + 1} attempts: {e}")
|
1302 |
-
# raise DatabaseError(f"Error adding media with keywords: {e}")
|
1303 |
-
# except Exception as e:
|
1304 |
-
# logging.error(f"Unexpected error: {e}")
|
1305 |
-
# raise DatabaseError(f"Unexpected error: {e}")
|
1306 |
-
#
|
1307 |
-
# raise DatabaseError("Failed to add media to database after multiple attempts")
|
1308 |
|
1309 |
#
|
1310 |
# End of ....
|
|
|
|
|
|
|
1311 |
#######################################################################################################################
|
1312 |
#
|
1313 |
# Functions to manage prompts DB
|
@@ -1570,6 +1489,9 @@ def delete_prompt(prompt_id):
|
|
1570 |
|
1571 |
#
|
1572 |
#
|
|
|
|
|
|
|
1573 |
#######################################################################################################################
|
1574 |
#
|
1575 |
# Function to fetch/update media content
|
@@ -1834,6 +1756,9 @@ def search_and_display_items(query, search_type, page, entries_per_page,char_cou
|
|
1834 |
|
1835 |
#
|
1836 |
# End of Functions to manage prompts DB / Fetch and update media content
|
|
|
|
|
|
|
1837 |
#######################################################################################################################
|
1838 |
#
|
1839 |
# Obsidian-related Functions
|
@@ -1899,6 +1824,9 @@ def import_obsidian_note_to_db(note_data):
|
|
1899 |
|
1900 |
#
|
1901 |
# End of Obsidian-related Functions
|
|
|
|
|
|
|
1902 |
#######################################################################################################################
|
1903 |
#
|
1904 |
# Chat-related Functions
|
@@ -2501,6 +2429,9 @@ def get_paginated_files(page: int = 1, results_per_page: int = 50) -> Tuple[List
|
|
2501 |
|
2502 |
#
|
2503 |
# End of Functions to handle deletion of media items
|
|
|
|
|
|
|
2504 |
#######################################################################################################################
|
2505 |
#
|
2506 |
# Functions to manage document versions
|
@@ -2572,4 +2503,153 @@ def get_document_version(media_id: int, version_number: int = None) -> Dict[str,
|
|
2572 |
|
2573 |
#
|
2574 |
# End of Functions to manage document versions
|
2575 |
-
#######################################################################################################################
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
49 |
import html
|
50 |
import logging
|
51 |
import os
|
52 |
+
import queue
|
53 |
import re
|
54 |
import shutil
|
55 |
import sqlite3
|
|
|
56 |
import traceback
|
|
|
57 |
from datetime import datetime, timedelta
|
58 |
+
from typing import List, Tuple, Dict, Any, Optional
|
59 |
# Local Libraries
|
60 |
+
from App_Function_Libraries.Utils.Utils import get_project_relative_path, get_database_path, \
|
61 |
+
get_database_dir
|
62 |
+
from App_Function_Libraries.Chunk_Lib import chunk_options, chunk_text
|
63 |
+
#
|
64 |
# Third-Party Libraries
|
65 |
import gradio as gr
|
66 |
import pandas as pd
|
67 |
import yaml
|
|
|
|
|
|
|
68 |
#
|
69 |
#######################################################################################################################
|
70 |
# Function Definitions
|
|
|
76 |
ensure_database_directory()
|
77 |
|
78 |
# Set up logging
|
|
|
|
|
79 |
logger = logging.getLogger(__name__)
|
80 |
|
81 |
# FIXME - Setup properly and test/add documentation for its existence...
|
|
|
113 |
|
114 |
#
|
115 |
#
|
116 |
+
#######################################################################################################################
|
117 |
+
|
118 |
+
|
119 |
#######################################################################################################################
|
120 |
#
|
121 |
# Backup-related functions
|
|
|
179 |
|
180 |
#
|
181 |
#
|
182 |
+
#######################################################################################################################
|
183 |
+
|
184 |
+
|
185 |
#######################################################################################################################
|
186 |
#
|
187 |
# DB-Integrity Check Functions
|
|
|
206 |
|
207 |
#
|
208 |
# End of DB-Integrity Check functions
|
209 |
+
#######################################################################################################################
|
210 |
+
|
211 |
+
|
212 |
#######################################################################################################################
|
213 |
#
|
214 |
+
# DB Setup Functions
|
215 |
|
|
|
216 |
class DatabaseError(Exception):
    """Raised when an SQLite operation against the media database fails."""
    pass
|
218 |
|
|
|
219 |
class InputError(Exception):
    """Raised when caller-supplied data is invalid (e.g. unknown media type)."""
    pass
|
221 |
|
|
|
|
|
|
|
222 |
class Database:
    """Thin wrapper around one SQLite database file with a generous lock timeout."""

    def __init__(self, db_name='media_summary.db'):
        # Resolve the database file inside the project's database directory.
        self.db_path = get_database_path(db_name)
        # Wait up to 60 seconds on a locked database before failing.
        self.timeout = 60.0

    def get_connection(self):
        """Open and return a fresh sqlite3 connection to the configured path."""
        return sqlite3.connect(self.db_path, timeout=self.timeout)

    def execute_query(self, query: str, params: Tuple = ()) -> None:
        """Execute one statement and commit; wrap failures in DatabaseError."""
        with self.get_connection() as conn:
            try:
                conn.cursor().execute(query, params)
                conn.commit()
            except sqlite3.Error as e:
                message = f"Database error: {e}, Query: {query}"
                logging.error(message)
                raise DatabaseError(message)

    def execute_many(self, query: str, params_list: List[Tuple]) -> None:
        """Execute one statement per parameter tuple, then commit once."""
        with self.get_connection() as conn:
            try:
                conn.cursor().executemany(query, params_list)
                conn.commit()
            except sqlite3.Error as e:
                message = f"Database error: {e}, Query: {query}"
                logging.error(message)
                raise DatabaseError(message)
|
249 |
|
250 |
db = Database()
|
251 |
|
252 |
+
def instantiate_sqlite_db():
    # (Re)create the module-level SQLite handle used by legacy call sites.
    global sqlite_db
    sqlite_db = Database()
|
255 |
|
|
|
272 |
transcription_model TEXT,
|
273 |
is_trash BOOLEAN DEFAULT 0,
|
274 |
trash_date DATETIME,
|
275 |
+
vector_embedding BLOB,
|
276 |
+
chunking_status TEXT DEFAULT 'pending'
|
277 |
)
|
278 |
''',
|
279 |
'''
|
|
|
427 |
create_tables(db)
|
428 |
|
429 |
|
430 |
+
def check_media_exists(title: str, url: str) -> Optional[int]:
    """Return the Media row id matching both title and url, or None.

    Any failure during the lookup is logged and treated as "not found".
    """
    lookup_sql = 'SELECT id FROM Media WHERE title = ? AND url = ?'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(lookup_sql, (title, url))
            row = cursor.fetchone()
            logging.debug(f"check_media_exists query: {lookup_sql}")
            logging.debug(f"check_media_exists params: title={title}, url={url}")
            logging.debug(f"check_media_exists result: {row}")
            if row:
                return row[0]
            return None
    except Exception as e:
        logging.error(f"Error checking if media exists: {str(e)}")
        logging.error(f"Exception details: {traceback.format_exc()}")
        return None
|
445 |
|
446 |
|
447 |
def check_media_and_whisper_model(title=None, url=None, current_whisper_model=None):
|
|
|
513 |
return False, f"Media found with same whisper model (ID: {media_id})"
|
514 |
|
515 |
|
516 |
+
def add_media_chunk(media_id: int, chunk_text: str, start_index: int, end_index: int, chunk_id: str):
|
517 |
with db.get_connection() as conn:
|
518 |
cursor = conn.cursor()
|
519 |
cursor.execute(
|
|
|
535 |
cursor.execute("SELECT id, content, type FROM Media WHERE id NOT IN (SELECT DISTINCT media_id FROM MediaChunks)")
|
536 |
return cursor.fetchall()
|
537 |
|
538 |
+
def get_next_media_id():
    """Return the next unused media id (max existing id + 1, or 1 for empty).

    NOTE(review): queries MAX(media_id) on table 'media'; the schema elsewhere
    defines 'Media' keyed by 'id' -- confirm the intended table/column names.
    """
    # Connect *before* entering the try block: in the original, a failed
    # connect reached `finally` with `conn` unbound, raising UnboundLocalError
    # and masking the real connection error.
    conn = sqlite3.connect(db_path)
    try:
        cursor = conn.cursor()
        cursor.execute("SELECT MAX(media_id) FROM media")
        max_id = cursor.fetchone()[0]
        return (max_id or 0) + 1
    finally:
        conn.close()
+
|
548 |
#
|
549 |
# End of Media-related Functions
|
550 |
+
#######################################################################################################################
|
551 |
+
|
552 |
+
|
553 |
#######################################################################################################################
|
554 |
# Keyword-related Functions
|
555 |
#
|
|
|
598 |
# Function to add media with keywords
|
599 |
def add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author,
|
600 |
ingestion_date):
|
601 |
+
logging.debug(f"Entering add_media_with_keywords: URL={url}, Title={title}")
|
602 |
# Set default values for missing fields
|
603 |
+
if url is None:
|
604 |
+
url = 'localhost'
|
605 |
+
elif url is not None:
|
606 |
+
url = url
|
607 |
title = title or 'Untitled'
|
608 |
media_type = media_type or 'Unknown'
|
609 |
content = content or 'No content available'
|
|
|
614 |
author = author or 'Unknown'
|
615 |
ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')
|
616 |
|
|
|
|
|
|
|
|
|
617 |
if media_type not in ['article', 'audio', 'document', 'mediawiki_article', 'mediawiki_dump', 'obsidian_note', 'podcast', 'text', 'video', 'unknown']:
|
618 |
raise InputError("Invalid media type. Allowed types: article, audio file, document, obsidian_note podcast, text, video, unknown.")
|
619 |
|
|
|
639 |
|
640 |
try:
|
641 |
with db.get_connection() as conn:
|
|
|
642 |
cursor = conn.cursor()
|
643 |
|
644 |
+
# Check if media already exists using both title and URL
|
645 |
+
existing_media_id = check_media_exists(title, url)
|
646 |
+
logging.debug(f"Existing media ID for {url}: {existing_media_id}")
|
|
|
|
|
|
|
|
|
647 |
|
648 |
+
if existing_media_id:
|
649 |
+
media_id = existing_media_id
|
650 |
+
logging.debug(f"Updating existing media with ID: {media_id}")
|
651 |
cursor.execute('''
|
652 |
UPDATE Media
|
653 |
+
SET content = ?, transcription_model = ?, type = ?, author = ?, ingestion_date = ?
|
654 |
WHERE id = ?
|
655 |
+
''', (content, transcription_model, media_type, author, ingestion_date, media_id))
|
656 |
else:
|
657 |
+
logging.debug("Inserting new media")
|
|
|
658 |
cursor.execute('''
|
659 |
INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
|
660 |
VALUES (?, ?, ?, ?, ?, ?, ?)
|
661 |
''', (url, title, media_type, content, author, ingestion_date, transcription_model))
|
662 |
media_id = cursor.lastrowid
|
663 |
+
logging.debug(f"New media inserted with ID: {media_id}")
|
664 |
|
|
|
665 |
cursor.execute('''
|
666 |
INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
|
667 |
VALUES (?, ?, ?, ?)
|
668 |
''', (media_id, prompt, summary, ingestion_date))
|
|
|
669 |
|
670 |
+
# Batch insert keywords
|
671 |
+
keyword_params = [(keyword.strip().lower(),) for keyword in keyword_list]
|
672 |
+
cursor.executemany('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', keyword_params)
|
673 |
+
|
674 |
+
# Get keyword IDs
|
675 |
+
placeholder = ','.join(['?'] * len(keyword_list))
|
676 |
+
cursor.execute(f'SELECT id, keyword FROM Keywords WHERE keyword IN ({placeholder})', keyword_list)
|
677 |
+
keyword_ids = cursor.fetchall()
|
678 |
+
|
679 |
+
# Batch insert media-keyword associations
|
680 |
+
media_keyword_params = [(media_id, keyword_id) for keyword_id, _ in keyword_ids]
|
681 |
+
cursor.executemany('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)', media_keyword_params)
|
682 |
|
683 |
# Update full-text search index
|
|
|
684 |
cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
|
685 |
(media_id, title, content))
|
686 |
|
687 |
+
# Add media version
|
688 |
+
add_media_version(conn, media_id, prompt, summary)
|
689 |
|
690 |
conn.commit()
|
691 |
logging.info(f"Media '{title}' successfully added/updated with ID: {media_id}")
|
692 |
|
693 |
+
return media_id, f"Media '{title}' added/updated successfully with keywords: {', '.join(keyword_list)}"
|
694 |
|
695 |
except sqlite3.Error as e:
|
696 |
+
logging.error(f"SQL Error in add_media_with_keywords: {e}")
|
|
|
697 |
raise DatabaseError(f"Error adding media with keywords: {e}")
|
698 |
except Exception as e:
|
699 |
+
logging.error(f"Unexpected Error in add_media_with_keywords: {e}")
|
|
|
700 |
raise DatabaseError(f"Unexpected error: {e}")
|
701 |
|
702 |
|
|
|
848 |
|
849 |
#
|
850 |
# End of Keyword-related functions
|
851 |
+
#######################################################################################################################
|
852 |
+
|
853 |
+
|
854 |
#######################################################################################################################
|
855 |
#
|
856 |
# Media-related Functions
|
|
|
907 |
return "", "", ""
|
908 |
|
909 |
#
|
910 |
+
# End of Media-related Functions
|
911 |
+
#######################################################################################################################
|
912 |
+
|
913 |
+
|
914 |
#######################################################################################################################
|
915 |
#
|
916 |
# Media-related Functions
|
|
|
918 |
|
919 |
|
920 |
# Function to add a version of a prompt and summary
|
921 |
+
def add_media_version(conn, media_id: int, prompt: str, summary: str) -> None:
    """Append a new MediaVersion row for media_id with the next version number.

    Uses the caller's open connection/transaction and does not commit.

    :param conn: open sqlite3 connection (caller owns commit/rollback)
    :param media_id: id of the Media row being versioned
    :param prompt: prompt text recorded for this version
    :param summary: summary text recorded for this version
    :raises sqlite3.Error: if either statement fails (logged before re-raise)
    """
    try:
        cursor = conn.cursor()

        # Get the current version number (0 when no versions exist yet).
        cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
        current_version = cursor.fetchone()[0] or 0

        # Insert the new version.
        cursor.execute('''
        INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
        VALUES (?, ?, ?, ?, ?)
        ''', (media_id, current_version + 1, prompt, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
    except sqlite3.Error as e:
        # Bug fix: the original caught DatabaseError, which sqlite3 never
        # raises here, so real database failures went unlogged. Log, then
        # re-raise the same exception so caller behavior is unchanged.
        logging.error(f"Error adding media version: {e}")
        raise
|
|
|
1113 |
except ValueError:
|
1114 |
return False
|
1115 |
|
|
|
1116 |
def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'):
|
1117 |
db = Database()
|
1118 |
try:
|
|
|
1143 |
media_id = existing_media[0]
|
1144 |
cursor.execute('''
|
1145 |
UPDATE Media
|
1146 |
+
SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?, chunking_status = ?
|
1147 |
WHERE id = ?
|
1148 |
''', (content, whisper_model, info_dict.get('title', 'Untitled'), media_type,
|
1149 |
+
info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), 'pending', media_id))
|
1150 |
else:
|
1151 |
cursor.execute('''
|
1152 |
+
INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model, chunking_status)
|
1153 |
+
VALUES (?, ?, ?, ?, ?, ?, ?, ?)
|
1154 |
''', (url, info_dict.get('title', 'Untitled'), media_type, content,
|
1155 |
+
info_dict.get('uploader', 'Unknown'), datetime.now().strftime('%Y-%m-%d'), whisper_model, 'pending'))
|
1156 |
media_id = cursor.lastrowid
|
1157 |
|
1158 |
# Add modification
|
|
|
1179 |
cursor.execute('''
|
1180 |
INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
|
1181 |
VALUES (?, ?, ?, ?, ?)
|
1182 |
+
''', (media_id, current_version + 1, custom_prompt_input, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
|
|
|
1183 |
|
1184 |
+
conn.commit()
|
1185 |
|
1186 |
+
# Schedule chunking
|
1187 |
+
schedule_chunking(media_id, content, info_dict.get('title', 'Untitled'))
|
1188 |
+
|
1189 |
+
return f"Media '{info_dict.get('title', 'Untitled')}' added/updated successfully with keywords: {', '.join(keyword_list)}. Chunking scheduled."
|
1190 |
+
|
1191 |
+
except DatabaseError as e:
|
1192 |
logging.error(f"Database error: {e}")
|
1193 |
+
raise
|
1194 |
except Exception as e:
|
1195 |
logging.error(f"Unexpected error: {e}")
|
1196 |
raise DatabaseError(f"Unexpected error: {e}")
|
1197 |
|
1198 |
+
|
1199 |
+
# FIXME: This function is not complete and needs to be implemented
def schedule_chunking(media_id: int, content: str, media_name: str):
    """Chunk `content` and persist the pieces, then record chunking status.

    Writes one MediaChunks row per chunk and sets Media.chunking_status to
    'completed' on success or 'failed' on any error (the original left the
    row stuck in 'pending' when chunking blew up).
    """
    db = Database()
    try:
        chunks = chunk_text(content, chunk_options['method'], chunk_options['max_size'], chunk_options['overlap'])
        with db.get_connection() as conn:
            cursor = conn.cursor()
            # Batch-insert every chunk; start/end indexes are approximated
            # from the configured max chunk size, as in the original.
            chunk_rows = [
                (media_id, chunk, i * chunk_options['max_size'],
                 min((i + 1) * chunk_options['max_size'], len(content)),
                 f"{media_id}_chunk_{i}")
                for i, chunk in enumerate(chunks)
            ]
            cursor.executemany('''
                INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index, chunk_id)
                VALUES (?, ?, ?, ?, ?)
            ''', chunk_rows)
            # Mark chunking as done in the same connection/transaction.
            cursor.execute("UPDATE Media SET chunking_status = 'completed' WHERE id = ?", (media_id,))
            conn.commit()
    except Exception as e:
        logging.error(f"Error scheduling chunking for media_id {media_id}: {str(e)}")
        # Record the failure so the media row does not stay 'pending' forever.
        try:
            with db.get_connection() as conn:
                conn.execute("UPDATE Media SET chunking_status = 'failed' WHERE id = ?", (media_id,))
                conn.commit()
        except Exception:
            logging.error(f"Could not mark chunking as failed for media_id {media_id}")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1224 |
|
1225 |
#
|
1226 |
# End of ....
|
1227 |
+
#######################################################################################################################
|
1228 |
+
|
1229 |
+
|
1230 |
#######################################################################################################################
|
1231 |
#
|
1232 |
# Functions to manage prompts DB
|
|
|
1489 |
|
1490 |
#
|
1491 |
#
|
1492 |
+
#######################################################################################################################
|
1493 |
+
|
1494 |
+
|
1495 |
#######################################################################################################################
|
1496 |
#
|
1497 |
# Function to fetch/update media content
|
|
|
1756 |
|
1757 |
#
|
1758 |
# End of Functions to manage prompts DB / Fetch and update media content
|
1759 |
+
#######################################################################################################################
|
1760 |
+
|
1761 |
+
|
1762 |
#######################################################################################################################
|
1763 |
#
|
1764 |
# Obsidian-related Functions
|
|
|
1824 |
|
1825 |
#
|
1826 |
# End of Obsidian-related Functions
|
1827 |
+
#######################################################################################################################
|
1828 |
+
|
1829 |
+
|
1830 |
#######################################################################################################################
|
1831 |
#
|
1832 |
# Chat-related Functions
|
|
|
2429 |
|
2430 |
#
|
2431 |
# End of Functions to handle deletion of media items
|
2432 |
+
#######################################################################################################################
|
2433 |
+
|
2434 |
+
|
2435 |
#######################################################################################################################
|
2436 |
#
|
2437 |
# Functions to manage document versions
|
|
|
2503 |
|
2504 |
#
|
2505 |
# End of Functions to manage document versions
|
2506 |
+
#######################################################################################################################
|
2507 |
+
|
2508 |
+
|
2509 |
+
#######################################################################################################################
|
2510 |
+
#
|
2511 |
+
# Functions to manage media chunks
|
2512 |
+
|
2513 |
+
def process_chunks(database, chunks: List[Dict], media_id: int, batch_size: int = 100):
    """
    Process chunks in batches and insert them into the database.

    :param database: Database instance to use for inserting chunks
    :param chunks: List of chunk dictionaries
    :param media_id: ID of the media these chunks belong to
    :param batch_size: Number of chunks to process in each batch
    """
    insert_sql = "INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index) VALUES (?, ?, ?, ?)"
    total_chunks = len(chunks)
    processed_chunks = 0

    for start in range(0, total_chunks, batch_size):
        batch = chunks[start:start + batch_size]
        rows = [(media_id, c['text'], c['start_index'], c['end_index']) for c in batch]
        try:
            database.execute_many(insert_sql, rows)
        except Exception as e:
            logging.error(f"Error inserting chunk batch for media_id {media_id}: {e}")
            # Optionally, you could raise an exception here to stop processing
            # raise
        else:
            # Only count and report a batch once it was actually inserted.
            processed_chunks += len(batch)
            logging.info(f"Processed {processed_chunks}/{total_chunks} chunks for media_id {media_id}")

    logging.info(f"Finished processing all {total_chunks} chunks for media_id {media_id}")
|
2545 |
+
|
2546 |
+
|
2547 |
+
# Usage example:
|
2548 |
+
# chunks = [{'text': 'chunk1', 'start_index': 0, 'end_index': 10}, ...]
|
2549 |
+
# process_chunks(db, chunks, media_id=1, batch_size=100)
|
2550 |
+
|
2551 |
+
def batch_insert_chunks(conn, chunks, media_id):
    """Insert all chunks for one media item via a single executemany call.

    Chunk ids are '{media_id}_chunk_{n}' with n starting at 1; start/end
    indexes are taken from each chunk's 'metadata' dict.
    """
    rows = []
    for n, chunk in enumerate(chunks, 1):
        meta = chunk['metadata']
        rows.append((media_id, chunk['text'], meta['start_index'], meta['end_index'], f"{media_id}_chunk_{n}"))

    conn.cursor().executemany('''
        INSERT INTO MediaChunks (media_id, chunk_text, start_index, end_index, chunk_id)
        VALUES (?, ?, ?, ?, ?)
    ''', rows)
|
2565 |
+
|
2566 |
+
|
2567 |
+
chunk_queue = queue.Queue()
|
2568 |
+
|
2569 |
+
def chunk_processor():
    # Worker loop: drain batches from the module-level chunk_queue and write
    # each one to the database in its own transaction.
    # A batch is a dict with 'chunks' and 'media_id' keys; enqueueing None
    # shuts the worker down.
    while True:
        chunk_batch = chunk_queue.get()
        if chunk_batch is None:
            # Sentinel: exits without calling task_done() for it.
            # NOTE(review): a producer that join()s the queue after sending
            # the sentinel will block on this uncounted task -- confirm intended.
            break
        try:
            with db.get_connection() as conn:
                conn.execute("BEGIN TRANSACTION")
                try:
                    batch_insert_chunks(conn, chunk_batch['chunks'], chunk_batch['media_id'])
                    conn.commit()
                except Exception as e:
                    # Roll back so a partial batch never persists.
                    conn.rollback()
                    logging.error(f"Error in batch insert: {str(e)}")
        except Exception as e:
            logging.error(f"Error processing chunk batch: {str(e)}")
        finally:
            # Always acknowledge the dequeued batch, even on failure.
            chunk_queue.task_done()
|
2587 |
+
|
2588 |
+
# Start the chunk processor thread
|
2589 |
+
#chunk_processor_thread = threading.Thread(target=chunk_processor)
|
2590 |
+
#chunk_processor_thread.start()
|
2591 |
+
|
2592 |
+
# Make sure to properly shut down the chunk processor when your application exits
|
2593 |
+
# def shutdown_chunk_processor():
|
2594 |
+
# chunk_queue.put(None)
|
2595 |
+
# chunk_processor_thread.join()
|
2596 |
+
|
2597 |
+
#FIXME - add into main db creation code
def update_media_chunks_table():
    """One-shot migration: rebuild MediaChunks so it includes a chunk_id column.

    Skips the rebuild when the column is already present (or the table does
    not exist yet), so calling this at import time repeatedly no longer
    rebuilds the table -- the original rebuilt on every run and silently
    dropped all existing chunk_id values each time.
    """
    with db.get_connection() as conn:
        cursor = conn.cursor()
        # Only migrate when MediaChunks exists and still lacks chunk_id.
        cursor.execute("PRAGMA table_info(MediaChunks)")
        existing_columns = [row[1] for row in cursor.fetchall()]
        if not existing_columns or 'chunk_id' in existing_columns:
            return
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS MediaChunks_new (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                media_id INTEGER,
                chunk_text TEXT,
                start_index INTEGER,
                end_index INTEGER,
                chunk_id TEXT,
                FOREIGN KEY (media_id) REFERENCES Media(id)
            )
        ''')
        cursor.execute('''
            INSERT INTO MediaChunks_new (media_id, chunk_text, start_index, end_index)
            SELECT media_id, chunk_text, start_index, end_index FROM MediaChunks
        ''')
        cursor.execute('DROP TABLE MediaChunks')
        cursor.execute('ALTER TABLE MediaChunks_new RENAME TO MediaChunks')

    logger.info("Updated MediaChunks table schema")
|
2620 |
+
|
2621 |
+
update_media_chunks_table()
|
2622 |
+
# Above function is a dirty hack that should be merged into the initial DB creation statement. This is a placeholder
|
2623 |
+
# FIXME
|
2624 |
+
|
2625 |
+
|
2626 |
+
# This is backwards compatibility for older setups.
# Function to add a missing column to the Media table
def add_missing_column_if_not_exists(db, table_name, column_name, column_definition):
    """Add `column_name` to `table_name` if it is not already present.

    :param db: open sqlite3 connection
    :param table_name: table to inspect/alter
    :param column_name: name of the column to ensure exists
    :param column_definition: type and constraints, e.g. "TEXT DEFAULT 'pending'"
    :raises sqlite3.Error: if the inspection or ALTER fails (logged first)
    """
    try:
        # Check if the column already exists in the table
        cursor = db.cursor()
        cursor.execute(f"PRAGMA table_info({table_name})")
        columns = [column[1] for column in cursor.fetchall()]

        # If the column is not found, add it
        if column_name not in columns:
            logging.info(f"Adding missing column '{column_name}' to table '{table_name}'")
            # Bug fix: the original omitted {column_name} here, so with the
            # call site's definition "TEXT DEFAULT 'pending'" SQLite created
            # a column literally named TEXT and the requested column never
            # existed (making every later call retry the ALTER and fail).
            cursor.execute(f"ALTER TABLE {table_name} ADD COLUMN {column_name} {column_definition}")
            db.commit()
            logging.info(f"Column '{column_name}' added successfully.")
        else:
            logging.info(f"Column '{column_name}' already exists in table '{table_name}'")

    except sqlite3.Error as e:
        logging.error(f"Error checking or adding column '{column_name}' in table '{table_name}': {e}")
        raise
|
2647 |
+
|
2648 |
+
# Example usage of the function
def update_media_table(db):
    # Backward compatibility: ensure the chunking_status column exists on Media
    add_missing_column_if_not_exists(db, 'Media', 'chunking_status', "TEXT DEFAULT 'pending'")
|
2652 |
+
|
2653 |
+
#
|
2654 |
+
# End of Functions to manage media chunks
|
2655 |
+
#######################################################################################################################
|