oceansweep committed on
Commit
4a72223
1 Parent(s): 8e7296e

SQLite_DB.py

Browse files
Files changed (1) hide show
  1. SQLite_DB.py +430 -0
SQLite_DB.py ADDED
@@ -0,0 +1,430 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sqlite3
2
+ import gradio as gr
3
+ import pandas as pd
4
+ import logging
5
+ import os
6
+ import re
7
+ import time
8
+ from typing import List, Tuple, Union
9
+ from contextlib import contextmanager
10
+ from urllib.parse import urlparse
11
+ from datetime import datetime
12
+
13
+ # Set up logging
14
# Configure the root logger so INFO-and-above records are emitted.
logging.basicConfig(level=logging.INFO)
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
16
+
17
+
18
+ # Custom exceptions
19
class DatabaseError(Exception):
    """Raised when a database operation in this module fails."""
21
+
22
+
23
class InputError(Exception):
    """Raised when a caller supplies an invalid argument value."""
25
+
26
+
27
+ # Database connection function with connection pooling
28
class Database:
    """Small SQLite helper that lends out connections from an in-process pool.

    The database file is ``db_name`` when given, otherwise the ``DB_NAME``
    environment variable, otherwise ``media_summary.db``.  Idle connections
    are kept in ``self.pool`` (never more than ``self.pool_size``) and are
    reused by later calls to :meth:`get_connection`.
    """

    def __init__(self, db_name=None):
        self.db_name = db_name or os.getenv('DB_NAME', 'media_summary.db')
        self.pool = []       # idle connections ready to be handed out again
        self.pool_size = 10  # maximum number of idle connections kept around

    def _acquire_connection(self):
        """Return an idle pooled connection, or open a new one.

        Opening is retried up to five times, one second apart, while SQLite
        reports ``database is locked``.

        Raises:
            DatabaseError: if the connection cannot be opened.
        """
        try:
            return self.pool.pop()
        except IndexError:
            pass  # pool is empty: fall through and open a fresh connection

        retry_count = 5
        retry_delay = 1
        while retry_count > 0:
            try:
                return sqlite3.connect(self.db_name, check_same_thread=False)
            except sqlite3.OperationalError as e:
                if 'database is locked' not in str(e):
                    raise DatabaseError(f"Database error: {e}") from e
                logger.warning(f"Database is locked, retrying in {retry_delay} seconds...")
                retry_count -= 1
                time.sleep(retry_delay)
            except Exception as e:
                raise DatabaseError(f"Unexpected error: {e}") from e
        raise DatabaseError("Database is locked and retries have been exhausted")

    def _discard_pending_changes(self, conn):
        """Best-effort rollback so a failed transaction never leaks to the next borrower."""
        try:
            conn.rollback()
        except sqlite3.Error as e:
            logger.warning(f"Rollback failed: {e}")

    def _release_connection(self, conn):
        """Put ``conn`` back in the pool, or close it when the pool is full."""
        if len(self.pool) < self.pool_size:
            self.pool.append(conn)
        else:
            conn.close()

    @contextmanager
    def get_connection(self):
        """Lend a connection for the duration of a ``with`` block.

        The generator yields exactly once: a ``@contextmanager`` generator
        that yields again after an exception was thrown into it makes
        ``contextlib`` raise ``RuntimeError``, so retrying is limited to
        opening the connection.  The connection is returned to the pool
        exactly once, and pending changes are rolled back if the block fails.

        Yields:
            sqlite3.Connection: an open connection to ``self.db_name``.

        Raises:
            DatabaseError: if no connection can be obtained, or if the body of
                the ``with`` block raises.  A ``DatabaseError`` raised by the
                body propagates unchanged; other exceptions are wrapped.
        """
        conn = self._acquire_connection()
        succeeded = False
        try:
            yield conn
            succeeded = True
        except DatabaseError:
            raise
        except sqlite3.OperationalError as e:
            raise DatabaseError(f"Database error: {e}") from e
        except Exception as e:
            raise DatabaseError(f"Unexpected error: {e}") from e
        finally:
            if not succeeded:
                self._discard_pending_changes(conn)
            self._release_connection(conn)

    def execute_query(self, query: str, params: Tuple = ()) -> None:
        """Execute one statement with bound parameters and commit it.

        Args:
            query: SQL text, using ``?`` placeholders.
            params: Values bound to the placeholders.

        Raises:
            DatabaseError: if SQLite rejects the statement or the commit.
        """
        with self.get_connection() as conn:
            try:
                cursor = conn.cursor()
                cursor.execute(query, params)
                conn.commit()
            except sqlite3.Error as e:
                raise DatabaseError(f"Database error: {e}, Query: {query}") from e
68
+
69
# Shared module-level Database instance. With no db_name given it uses the
# DB_NAME environment variable, falling back to 'media_summary.db'.
db = Database()
70
+
71
+
72
+ # Function to create tables with the new media schema
73
def create_tables() -> None:
    """Create the media schema: tables, FTS5 virtual tables and indexes.

    Every statement uses ``IF NOT EXISTS``, so calling this repeatedly is
    safe.  Each statement is sent through ``db.execute_query``.
    """
    base_tables = (
        '''
        CREATE TABLE IF NOT EXISTS Media (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            title TEXT NOT NULL,
            type TEXT NOT NULL,
            content TEXT,
            author TEXT,
            ingestion_date TEXT,
            prompt TEXT,
            summary TEXT,
            transcription_model TEXT
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS Keywords (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            keyword TEXT NOT NULL UNIQUE
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS MediaKeywords (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            media_id INTEGER NOT NULL,
            keyword_id INTEGER NOT NULL,
            FOREIGN KEY (media_id) REFERENCES Media(id),
            FOREIGN KEY (keyword_id) REFERENCES Keywords(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS MediaVersion (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            media_id INTEGER NOT NULL,
            version INTEGER NOT NULL,
            prompt TEXT,
            summary TEXT,
            created_at TEXT NOT NULL,
            FOREIGN KEY (media_id) REFERENCES Media(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS MediaModifications (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            media_id INTEGER NOT NULL,
            prompt TEXT,
            summary TEXT,
            modification_date TEXT,
            FOREIGN KEY (media_id) REFERENCES Media(id)
        )
        ''',
    )
    # Full-text search tables over media title/content and over keywords.
    full_text_tables = (
        '''
        CREATE VIRTUAL TABLE IF NOT EXISTS media_fts USING fts5(title, content);
        ''',
        '''
        CREATE VIRTUAL TABLE IF NOT EXISTS keyword_fts USING fts5(keyword);
        ''',
    )
    # Secondary indexes on the columns used for lookups and joins.
    indexes = (
        '''
        CREATE INDEX IF NOT EXISTS idx_media_title ON Media(title);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_type ON Media(type);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_author ON Media(author);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_ingestion_date ON Media(ingestion_date);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON Keywords(keyword);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_mediakeywords_media_id ON MediaKeywords(media_id);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_mediakeywords_keyword_id ON MediaKeywords(keyword_id);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_version_media_id ON MediaVersion(media_id);
        ''',
    )
    # Same execution order as before: tables, then FTS tables, then indexes.
    for statement in (*base_tables, *full_text_tables, *indexes):
        db.execute_query(statement)
158
+
159
+ create_tables()
160
+
161
+
162
+ # Function to add a keyword
163
def add_keyword(keyword: str) -> int:
    """Insert a keyword if it is new and return its row id.

    The keyword is stripped and lower-cased before use.  It is stored in the
    ``Keywords`` table and mirrored into the ``keyword_fts`` full-text table
    under the same rowid.

    Args:
        keyword: The keyword text.

    Returns:
        The ``Keywords.id`` of the (new or already existing) keyword.

    Raises:
        InputError: if the keyword is empty after normalisation.
        DatabaseError: if SQLite reports an error.
    """
    keyword = (keyword or '').strip().lower()
    if not keyword:
        # A blank keyword would otherwise be stored as '' and pollute searches.
        raise InputError("Keyword cannot be empty.")

    with db.get_connection() as conn:
        cursor = conn.cursor()
        try:
            cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
            # Look the id up explicitly: lastrowid is not meaningful when the
            # INSERT was ignored because the keyword already existed.
            cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
            keyword_id = cursor.fetchone()[0]
            cursor.execute('INSERT OR IGNORE INTO keyword_fts (rowid, keyword) VALUES (?, ?)', (keyword_id, keyword))
            logger.info(f"Keyword '{keyword}' added to keyword_fts with ID: {keyword_id}")
            conn.commit()
            return keyword_id
        except sqlite3.IntegrityError as e:
            logger.error(f"Integrity error adding keyword: {e}")
            raise DatabaseError(f"Integrity error adding keyword: {e}") from e
        except sqlite3.Error as e:
            logger.error(f"Error adding keyword: {e}")
            raise DatabaseError(f"Error adding keyword: {e}") from e
181
+
182
+
183
+ # Function to delete a keyword
184
def delete_keyword(keyword: str) -> str:
    """Delete a keyword together with its media associations.

    The keyword is stripped and lower-cased before lookup.  When it exists,
    its rows are removed from ``MediaKeywords``, ``Keywords`` and
    ``keyword_fts`` in one transaction.

    Args:
        keyword: The keyword text.

    Returns:
        A human-readable message saying whether the keyword was deleted or
        was not found.

    Raises:
        DatabaseError: if SQLite reports an error.
    """
    keyword = keyword.strip().lower()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        try:
            cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
            row = cursor.fetchone()
            if row is None:
                return f"Keyword '{keyword}' not found."

            keyword_id = row[0]
            # Remove the link rows first so no MediaKeywords row is left
            # pointing at a keyword id that no longer exists.
            cursor.execute('DELETE FROM MediaKeywords WHERE keyword_id = ?', (keyword_id,))
            cursor.execute('DELETE FROM Keywords WHERE keyword = ?', (keyword,))
            cursor.execute('DELETE FROM keyword_fts WHERE rowid = ?', (keyword_id,))
            conn.commit()
            return f"Keyword '{keyword}' deleted successfully."
        except sqlite3.Error as e:
            raise DatabaseError(f"Error deleting keyword: {e}") from e
200
+
201
+
202
+
203
+ # Function to add media with keywords
204
def add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author, ingestion_date):
    """Store a media item with its keywords, prompt and summary.

    Missing (falsy) arguments are replaced by placeholder defaults.  If a
    ``Media`` row with the same URL already exists, only a new
    ``MediaModifications`` row is recorded for it.  Otherwise the media row,
    its keywords, its ``media_fts`` entry and an initial ``MediaModifications``
    row are created.  In both cases a new ``MediaVersion`` is added afterwards.

    Args:
        url: Source URL; anything that is not a valid URL is stored as
            ``'localhost'``.
        title: Media title.
        media_type: One of ``'document'``, ``'video'`` or ``'article'``.
        content: Full text or transcript.
        keywords: Comma-separated keyword string.
        prompt: Prompt used to produce the summary.
        summary: Summary text.
        transcription_model: Name of the transcription model used.
        author: Author name.
        ingestion_date: Date as ``YYYY-MM-DD``; defaults to today.

    Returns:
        A success message naming the title and the keywords.

    Raises:
        InputError: if ``media_type`` or ``ingestion_date`` is invalid.
        DatabaseError: if anything fails while writing to the database.
    """
    # Set default values for missing fields
    url = url or 'Unknown'
    title = title or 'Untitled'
    media_type = media_type or 'Unknown'
    content = content or 'No content available'
    keywords = keywords or 'default'
    prompt = prompt or 'No prompt available'
    summary = summary or 'No summary available'
    transcription_model = transcription_model or 'Unknown'
    author = author or 'Unknown'
    ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')

    # Use 'localhost' as the URL if no valid URL is provided.
    # NOTE(review): every item without a valid URL therefore shares the URL
    # 'localhost' and is treated as the same existing media by the lookup
    # below - confirm whether that merging is intended.
    if not is_valid_url(url):
        url = 'localhost'

    if media_type not in ['document', 'video', 'article']:
        raise InputError("Invalid media type. Allowed types: document, video, article.")

    # ingestion_date is always set at this point, so one check is enough.
    if not is_valid_date(ingestion_date):
        raise InputError("Invalid ingestion date format. Use YYYY-MM-DD.")

    # Normalise once: strip, lower-case, drop blanks and duplicates while
    # keeping the caller's order.
    keyword_list = list(dict.fromkeys(
        part.strip().lower() for part in keywords.split(',') if part.strip()
    )) or ['default']

    logger.info(f"URL: {url}")
    logger.info(f"Title: {title}")
    logger.info(f"Media Type: {media_type}")
    logger.info(f"Keywords: {keywords}")
    logger.info(f"Content: {content}")
    logger.info(f"Prompt: {prompt}")
    logger.info(f"Summary: {summary}")
    logger.info(f"Author: {author}")
    logger.info(f"Ingestion Date: {ingestion_date}")
    logger.info(f"Transcription Model: {transcription_model}")

    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()

            # Check if media already exists
            cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
            existing_media = cursor.fetchone()

            if existing_media:
                media_id = existing_media[0]
                logger.info(f"Existing media found with ID: {media_id}")

                # Insert new prompt and summary into MediaModifications
                cursor.execute('''
                    INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                    VALUES (?, ?, ?, ?)
                ''', (media_id, prompt, summary, ingestion_date))
                logger.info("New summary and prompt added to MediaModifications")
            else:
                logger.info("New media entry being created")

                # Insert the new media item.  prompt and summary are stored on
                # the row too, because search_db reads Media.prompt and
                # Media.summary and would otherwise always return NULL.
                cursor.execute('''
                    INSERT INTO Media (url, title, type, content, author, ingestion_date, prompt, summary, transcription_model)
                    VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)
                ''', (url, title, media_type, content, author, ingestion_date, prompt, summary, transcription_model))
                media_id = cursor.lastrowid

                # Insert keywords and associate with media item
                for keyword in keyword_list:
                    cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
                    cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
                    keyword_id = cursor.fetchone()[0]
                    cursor.execute('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)', (media_id, keyword_id))
                cursor.execute('INSERT INTO media_fts (rowid, title, content) VALUES (?, ?, ?)', (media_id, title, content))

                # Also insert the initial prompt and summary into MediaModifications
                cursor.execute('''
                    INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                    VALUES (?, ?, ?, ?)
                ''', (media_id, prompt, summary, ingestion_date))

            conn.commit()

        # Record the version only after the connection above has been handed
        # back, so add_media_version does not need a second connection.
        add_media_version(media_id, prompt, summary)

        return f"Media '{title}' added successfully with keywords: {', '.join(keyword_list)}"
    except DatabaseError as e:
        # Already in the module's error type: log and pass through unchanged.
        logger.error(f"Database Error: {e}")
        raise
    except sqlite3.IntegrityError as e:
        logger.error(f"Integrity Error: {e}")
        raise DatabaseError(f"Integrity error adding media with keywords: {e}") from e
    except sqlite3.Error as e:
        logger.error(f"SQL Error: {e}")
        raise DatabaseError(f"Error adding media with keywords: {e}") from e
    except Exception as e:
        logger.error(f"Unexpected Error: {e}")
        raise DatabaseError(f"Unexpected error: {e}") from e
302
+
303
+
304
+ # Function to add a version of a prompt and summary
305
def add_media_version(media_id: int, prompt: str, summary: str) -> None:
    """Append the next numbered prompt/summary version for a media item.

    The new version number is one more than the highest version stored for
    ``media_id`` (starting at 1), and the row is stamped with the current
    local time.

    Raises:
        DatabaseError: if SQLite reports an error.
    """
    try:
        with db.get_connection() as conn:
            cur = conn.cursor()

            # MAX() yields NULL (None) when the media item has no versions yet.
            cur.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
            latest_version = cur.fetchone()[0] or 0
            next_version = latest_version + 1

            cur.execute('''
                INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
                VALUES (?, ?, ?, ?, ?)
            ''', (media_id, next_version, prompt, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
            conn.commit()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error adding media version: {e}")
322
+
323
+
324
+ # Function to search the database with advanced options, including keyword search and full-text search
325
def search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10) -> Union[List[Tuple], str]:
    """Search media by full-text query and/or keywords, one page at a time.

    Args:
        search_query: FTS5 query matched against each field in
            ``search_fields``.
        search_fields: ``media_fts`` columns to search; only ``'title'`` and
            ``'content'`` are accepted.  A row matches when any listed field
            matches.
        keywords: Comma-separated keywords; each one is matched against
            ``keyword_fts``.
        page: 1-based page number.
        results_per_page: Maximum number of rows returned.

    Returns:
        A list of ``(url, title, type, content, author, ingestion_date,
        prompt, summary)`` tuples, or the string ``"No results found."``.

    Raises:
        InputError: if ``page`` or ``results_per_page`` is below 1, or a
            search field is not allowed.
        DatabaseError: if SQLite rejects the query.
    """
    # Validate input
    if page < 1:
        raise InputError("Page number must be 1 or greater.")
    if results_per_page < 1:
        raise InputError("Results per page must be 1 or greater.")

    # Field names end up inside the SQL text, so only known column names may
    # pass; everything else is bound as a parameter.
    allowed_fields = ('title', 'content')
    fields = [str(field).lower() for field in (search_fields or [])]
    rejected = [field for field in fields if field not in allowed_fields]
    if rejected:
        raise InputError(f"Invalid search field(s): {', '.join(rejected)}. Allowed fields: title, content.")

    keyword_terms = [keyword.strip().lower() for keyword in (keywords or '').split(',') if keyword.strip()]

    search_conditions = []
    params = []
    if fields:
        # Parenthesised so the OR group is not split by the AND that joins it
        # to the keyword group (AND binds tighter than OR in SQL).
        field_clause = " OR ".join(f"media_fts.{field} MATCH ?" for field in fields)
        search_conditions.append(f"({field_clause})")
        params.extend([search_query] * len(fields))
    if keyword_terms:
        # NOTE(review): all keyword terms are required of the same joined
        # keyword row - confirm this is the intended multi-keyword semantics.
        keyword_clause = " AND ".join("keyword_fts.keyword MATCH ?" for _ in keyword_terms)
        search_conditions.append(f"({keyword_clause})")
        params.extend(keyword_terms)

    # With no filters at all, omit WHERE instead of emitting invalid SQL.
    where_clause = f"WHERE {' AND '.join(search_conditions)}" if search_conditions else ""
    offset = (page - 1) * results_per_page

    # DISTINCT: the keyword joins otherwise repeat a media row once per
    # matching keyword.
    query = f'''
        SELECT DISTINCT Media.url, Media.title, Media.type, Media.content, Media.author, Media.ingestion_date, Media.prompt, Media.summary
        FROM Media
        JOIN media_fts ON Media.id = media_fts.rowid
        JOIN MediaKeywords ON Media.id = MediaKeywords.media_id
        JOIN Keywords ON MediaKeywords.keyword_id = Keywords.id
        JOIN keyword_fts ON Keywords.id = keyword_fts.rowid
        {where_clause}
        LIMIT ? OFFSET ?
    '''

    with db.get_connection() as conn:
        cursor = conn.cursor()
        try:
            cursor.execute(query, (*params, results_per_page, offset))
            results = cursor.fetchall()
            if not results:
                return "No results found."
            return results
        except sqlite3.Error as e:
            raise DatabaseError(f"Error executing query: {e}") from e
366
+
367
+
368
+ # Function to format results for display
369
def format_results(results: Union[List[Tuple], str]) -> pd.DataFrame:
    """Turn search results into a DataFrame for display.

    A string (the "no results" message) produces an empty DataFrame with no
    columns; a list of 8-tuples produces one labelled row per tuple.
    """
    column_names = ['URL', 'Title', 'Type', 'Content', 'Author', 'Ingestion Date', 'Prompt', 'Summary']
    if not isinstance(results, str):
        return pd.DataFrame(results, columns=column_names)
    # A message instead of rows: nothing to tabulate.
    return pd.DataFrame()
376
+
377
+
378
+ # Gradio function to handle user input and display results with pagination, with better feedback
379
def search_and_display(search_query: str, search_fields: List[str], keyword: str, page: int, dummy: bool = False):
    """Run a search for the Gradio UI and return the table plus a status update.

    Args:
        search_query: Text to search for; must not be blank.
        search_fields: Fields passed through to ``search_db``.
        keyword: Comma-separated keywords passed through to ``search_db``.
        page: 1-based page number.
        dummy: Accepted for interface compatibility and ignored.  The
            previous guard tested an undefined name (``submit``), which
            raised ``NameError`` on every call.

    Returns:
        A ``(DataFrame, gr.update)`` pair: the results table, and an update
        that shows a message when there are no results or an error occurred
        and hides it otherwise.
    """
    try:
        if not search_query or not search_query.strip():
            raise InputError("Please enter a valid search query.")

        results = search_db(search_query, search_fields, keyword, page)
        df = format_results(results)

        if df.empty:
            return df, gr.update(value="No results found.", visible=True)
        return df, gr.update(visible=False)
    except (DatabaseError, InputError) as e:
        return pd.DataFrame(), gr.update(value=str(e), visible=True)
396
+
397
+
398
+ # Function to export search results to CSV with pagination
399
def export_to_csv(search_query: str, search_fields: List[str], keyword: str, page: int = 1, results_per_file: int = 1000):
    """Export one page of search results to ``search_results_page_<page>.csv``.

    Args:
        search_query: Text to search for.
        search_fields: Fields passed through to ``search_db``.
        keyword: Comma-separated keywords passed through to ``search_db``.
        page: 1-based page number; also used in the file name.
        results_per_file: Maximum number of rows written to the file.

    Returns:
        A message naming the written file, the "no results" message from
        ``search_db`` when there is nothing to export, or the error text when
        the search fails.
    """
    try:
        results = search_db(search_query, search_fields, keyword, page, results_per_file)
        if isinstance(results, str):
            # Nothing matched: report that instead of writing an empty file
            # and claiming a successful export.
            return results
        df = format_results(results)
        filename = f'search_results_page_{page}.csv'
        df.to_csv(filename, index=False)
        return f"Results exported to {filename}"
    except (DatabaseError, InputError) as e:
        return str(e)
408
+
409
+
410
+ # Helper function to validate URL format
411
def is_valid_url(url: str) -> bool:
    """Return True when ``url`` is an http(s)/ftp(s) URL accepted by the pattern.

    The host may be a domain name, ``localhost``, an IPv4 address or an IPv6
    address, optionally followed by a port and a path or query string.
    Matching is case-insensitive.
    """
    url_pattern = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    if url_pattern.match(url):
        return True
    return False
421
+
422
+
423
+ # Helper function to validate date format
424
def is_valid_date(date_string: str) -> bool:
    """Return True when ``date_string`` parses as a ``YYYY-MM-DD`` calendar date."""
    try:
        datetime.strptime(date_string, '%Y-%m-%d')
    except ValueError:
        # Wrong layout or an impossible date such as 2024-02-30.
        return False
    else:
        return True
430
+