oceansweep committed on
Commit
ed28876
1 Parent(s): fe42ba0
Files changed (49) hide show
  1. App_Function_Libraries/Article_Extractor_Lib.py +107 -0
  2. App_Function_Libraries/Article_Summarization_Lib.py +284 -0
  3. App_Function_Libraries/Audio_Files.py +629 -0
  4. App_Function_Libraries/Audio_Transcription_Lib.py +158 -0
  5. App_Function_Libraries/Book_Ingestion_Lib.py +95 -0
  6. App_Function_Libraries/Chunk_Lib.py +467 -0
  7. App_Function_Libraries/Diarization_Lib.py +177 -0
  8. App_Function_Libraries/Gradio_Related.py +0 -0
  9. App_Function_Libraries/LLM_API_Calls.py +633 -0
  10. App_Function_Libraries/LLM_API_Calls_Local.py +348 -0
  11. App_Function_Libraries/Local_File_Processing_Lib.py +90 -0
  12. App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py +590 -0
  13. App_Function_Libraries/Local_Summarization_Lib.py +467 -0
  14. App_Function_Libraries/Markdown_Export-improvement.py +234 -0
  15. App_Function_Libraries/Obsidian-Importer.py +210 -0
  16. App_Function_Libraries/Old_Chunking_Lib.py +159 -0
  17. App_Function_Libraries/PDF_Ingestion_Lib.py +166 -0
  18. App_Function_Libraries/RAG_Library.py +812 -0
  19. App_Function_Libraries/SQLite_DB.py +973 -0
  20. App_Function_Libraries/Summarization_General_Lib.py +1388 -0
  21. App_Function_Libraries/System_Checks_Lib.py +184 -0
  22. App_Function_Libraries/Tokenization_Methods_Lib.py +30 -0
  23. App_Function_Libraries/Tone-Changer.py +46 -0
  24. App_Function_Libraries/Utils.py +440 -0
  25. App_Function_Libraries/Video_DL_Ingestion_Lib.py +315 -0
  26. App_Function_Libraries/__Init__.py +0 -0
  27. App_Function_Libraries/__pycache__/Article_Extractor_Lib.cpython-312.pyc +0 -0
  28. App_Function_Libraries/__pycache__/Article_Summarization_Lib.cpython-312.pyc +0 -0
  29. App_Function_Libraries/__pycache__/Audio_Files.cpython-312.pyc +0 -0
  30. App_Function_Libraries/__pycache__/Audio_Transcription_Lib.cpython-312.pyc +0 -0
  31. App_Function_Libraries/__pycache__/Book_Ingestion_Lib.cpython-312.pyc +0 -0
  32. App_Function_Libraries/__pycache__/Chunk_Lib.cpython-312.pyc +0 -0
  33. App_Function_Libraries/__pycache__/Diarization_Lib.cpython-312.pyc +0 -0
  34. App_Function_Libraries/__pycache__/Gradio_Related.cpython-312.pyc +0 -0
  35. App_Function_Libraries/__pycache__/LLM_API_Calls.cpython-312.pyc +0 -0
  36. App_Function_Libraries/__pycache__/Local_File_Processing_Lib.cpython-312.pyc +0 -0
  37. App_Function_Libraries/__pycache__/Local_LLM_Inference_Engine_Lib.cpython-312.pyc +0 -0
  38. App_Function_Libraries/__pycache__/Local_Summarization_Lib.cpython-312.pyc +0 -0
  39. App_Function_Libraries/__pycache__/Old_Chunking_Lib.cpython-312.pyc +0 -0
  40. App_Function_Libraries/__pycache__/PDF_Ingestion_Lib.cpython-312.pyc +0 -0
  41. App_Function_Libraries/__pycache__/SQLite_DB.cpython-312.pyc +0 -0
  42. App_Function_Libraries/__pycache__/Summarization_General_Lib.cpython-312.pyc +0 -0
  43. App_Function_Libraries/__pycache__/System_Checks_Lib.cpython-312.pyc +0 -0
  44. App_Function_Libraries/__pycache__/Tokenization_Methods_Lib.cpython-312.pyc +0 -0
  45. App_Function_Libraries/__pycache__/Utils.cpython-312.pyc +0 -0
  46. App_Function_Libraries/__pycache__/Video_DL_Ingestion_Lib.cpython-312.pyc +0 -0
  47. App_Function_Libraries/__pycache__/__init__.cpython-312.pyc +0 -0
  48. App_Function_Libraries/models/config.yaml +21 -0
  49. app.py +0 -0
App_Function_Libraries/Article_Extractor_Lib.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Article_Extractor_Lib.py
2
+ #########################################
3
+ # Article Extraction Library
4
+ # This library is used to handle scraping and extraction of articles from web pages.
5
+ # Currently, uses a combination of beautifulsoup4 and trafilatura to extract article text.
6
+ # Firecrawl would be a better option for this, but it is not yet implemented.
7
+ ####
8
+ #
9
+ ####################
10
+ # Function List
11
+ #
12
+ # 1. get_page_title(url)
13
+ # 2. get_article_text(url)
14
+ # 3. get_article_title(article_url_arg)
15
+ #
16
+ ####################
17
+ #
18
+ # Import necessary libraries
19
+ import logging
20
+ # 3rd-Party Imports
21
+ import asyncio
22
+ from playwright.async_api import async_playwright
23
+ from bs4 import BeautifulSoup
24
+ import requests
25
+ import trafilatura
26
+ # Import Local
27
+ #
28
+ #######################################################################################################################
29
+ # Function Definitions
30
+ #
31
+
32
def get_page_title(url: str) -> str:
    """Fetch *url* and return the text of its <title> tag.

    Returns "Untitled" when the request fails or the page has no usable
    title, so callers never have to handle an exception or None.
    """
    try:
        # Robustness fix: the original call had no timeout, so an
        # unresponsive server would hang the caller indefinitely.
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'html.parser')
        title_tag = soup.find('title')
        # Bug fix: an empty <title></title> gives title_tag.string == None,
        # and the original unconditionally called .strip() on it.
        if title_tag and title_tag.string:
            return title_tag.string.strip()
        return "Untitled"
    except requests.RequestException as e:
        logging.error(f"Error fetching page title: {e}")
        return "Untitled"
42
+
43
+
44
def get_artice_title(article_url_arg: str) -> str:
    """Return the page title for *article_url_arg*.

    Thin wrapper around get_page_title(); the misspelled name ('artice')
    is kept because external callers may already reference it.
    """
    # Use beautifulsoup to get the page title - Really should be using ytdlp for this....
    article_title = get_page_title(article_url_arg)
    # Bug fix: the original discarded the result and implicitly returned
    # None despite the declared -> str return type (the module's own
    # function list documents this as returning the title).
    return article_title
47
+
48
+
49
def scrape_article(url):
    # Scrape a single article URL: render the page with headless Chromium
    # (so JavaScript-built content is captured), extract the main text and
    # metadata with trafilatura, and return a dict with keys
    # 'title', 'author', 'content', 'date' — or None on extraction failure.

    async def fetch_html(url: str) -> str:
        # Render the page in headless Chromium and return the final DOM HTML.
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            # A desktop Chrome user agent, to avoid bot-targeted responses.
            context = await browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3")
            page = await context.new_page()
            await page.goto(url)
            await page.wait_for_load_state("networkidle")  # Wait for the network to be idle
            content = await page.content()
            await browser.close()
            return content

    def extract_article_data(html: str) -> dict:
        # Extract the main article body (no comments/tables/images) and the
        # title/author/date metadata.  Returns None when either step fails.
        downloaded = trafilatura.extract(html, include_comments=False, include_tables=False, include_images=False)
        if downloaded:
            metadata = trafilatura.extract_metadata(html)
            if metadata:
                return {
                    'title': metadata.title if metadata.title else 'N/A',
                    'author': metadata.author if metadata.author else 'N/A',
                    'content': downloaded,
                    'date': metadata.date if metadata.date else 'N/A',
                }
            else:
                print("Metadata extraction failed.")
                return None
        else:
            print("Content extraction failed.")
            return None

    def convert_html_to_markdown(html: str) -> str:
        # NOTE(review): despite the name, this receives trafilatura's already
        # *extracted text* (see fetch_and_extract_article below), not raw
        # HTML, so BeautifulSoup mostly passes it through with paragraph
        # separation applied — confirm this is intended.
        soup = BeautifulSoup(html, 'html.parser')
        # Convert each paragraph to markdown
        for para in soup.find_all('p'):
            para.append('\n')  # Add a newline at the end of each paragraph for markdown separation

        # Use .get_text() with separator to keep paragraph separation
        text = soup.get_text(separator='\n\n')

        return text

    async def fetch_and_extract_article(url: str):
        # Orchestrates fetch -> extract -> markdown conversion.
        html = await fetch_html(url)
        print("HTML Content:", html[:500])  # Print first 500 characters of the HTML for inspection
        article_data = extract_article_data(html)
        if article_data:
            article_data['content'] = convert_html_to_markdown(article_data['content'])
            return article_data
        else:
            return None

    # Using asyncio.run to handle event loop creation and execution
    # NOTE(review): asyncio.run() raises RuntimeError if called from inside a
    # running event loop — confirm all callers are synchronous.
    article_data = asyncio.run(fetch_and_extract_article(url))
    return article_data
104
+
105
+ #
106
+ #
107
+ #######################################################################################################################
App_Function_Libraries/Article_Summarization_Lib.py ADDED
@@ -0,0 +1,284 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Article_Summarization_Lib.py
2
+ #########################################
3
+ # Article Summarization Library
4
+ # This library is used to handle summarization of articles.
5
+
6
+ #
7
+ ####
8
+ #
9
+ ####################
10
+ # Function List
11
+ #
12
+ # 1.
13
+ #
14
+ ####################
15
+ #
16
+ # Import necessary libraries
17
+ import datetime
18
+ from datetime import datetime
19
+ import gradio as gr
20
+ import json
21
+ import os
22
+ import logging
23
+ import requests
24
+ # 3rd-Party Imports
25
+ from tqdm import tqdm
26
+
27
+ from App_Function_Libraries.Utils import sanitize_filename
28
+ # Local Imports
29
+ from Article_Extractor_Lib import scrape_article
30
+ from Local_Summarization_Lib import summarize_with_llama, summarize_with_oobabooga, summarize_with_tabbyapi, \
31
+ summarize_with_vllm, summarize_with_kobold, save_summary_to_file, summarize_with_local_llm
32
+ from Summarization_General_Lib import summarize_with_openai, summarize_with_anthropic, summarize_with_cohere, summarize_with_groq, summarize_with_openrouter, summarize_with_deepseek, summarize_with_huggingface
33
+ from SQLite_DB import Database, create_tables, add_media_with_keywords
34
+ #
35
+ #######################################################################################################################
36
+ # Function Definitions
37
+ #
38
+
39
def ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date, custom_prompt):
    """Store a scraped article in the media database.

    Missing fields are defaulted (e.g. 'Unknown' author, today's date).
    Returns the result of add_media_with_keywords on success, or the
    exception message string on failure — this function never raises.
    """
    try:
        # Check if content is not empty or whitespace
        if not content.strip():
            raise ValueError("Content is empty.")

        db = Database()
        create_tables()
        keyword_list = keywords.split(',') if keywords else ["default"]
        keyword_str = ', '.join(keyword_list)

        # Set default values for missing fields
        url = url or 'Unknown'
        title = title or 'Unknown'
        author = author or 'Unknown'
        keywords = keywords or 'default'
        summary = summary or 'No summary available'
        # Bug fix: this module does `from datetime import datetime`, so the
        # name `datetime` is the class; the original `datetime.datetime.now()`
        # raised AttributeError whenever ingestion_date was not supplied.
        ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')

        # Log the values of all fields before calling add_media_with_keywords
        logging.debug(f"URL: {url}")
        logging.debug(f"Title: {title}")
        logging.debug(f"Author: {author}")
        logging.debug(f"Content: {content[:50]}... (length: {len(content)})")  # Log first 50 characters of content
        logging.debug(f"Keywords: {keywords}")
        logging.debug(f"Summary: {summary}")
        logging.debug(f"Ingestion Date: {ingestion_date}")
        logging.debug(f"Custom Prompt: {custom_prompt}")

        # Check if any required field is empty and log the specific missing field
        # (after the defaulting above, only custom_prompt can still be falsy)
        if not url:
            logging.error("URL is missing.")
            raise ValueError("URL is missing.")
        if not title:
            logging.error("Title is missing.")
            raise ValueError("Title is missing.")
        if not content:
            logging.error("Content is missing.")
            raise ValueError("Content is missing.")
        if not keywords:
            logging.error("Keywords are missing.")
            raise ValueError("Keywords are missing.")
        if not summary:
            logging.error("Summary is missing.")
            raise ValueError("Summary is missing.")
        if not ingestion_date:
            logging.error("Ingestion date is missing.")
            raise ValueError("Ingestion date is missing.")
        if not custom_prompt:
            logging.error("Custom prompt is missing.")
            raise ValueError("Custom prompt is missing.")

        # Add media with keywords to the database
        result = add_media_with_keywords(
            url=url,
            title=title,
            media_type='article',
            content=content,
            keywords=keyword_str or "article_default",
            prompt=custom_prompt or None,
            summary=summary or "No summary generated",
            transcription_model=None,  # or some default value if applicable
            author=author or 'Unknown',
            ingestion_date=ingestion_date
        )
        return result
    except Exception as e:
        logging.error(f"Failed to ingest article to the database: {e}")
        return str(e)
108
+
109
+
110
def scrape_and_summarize_multiple(urls, custom_prompt_arg, api_name, api_key, keywords, custom_article_titles):
    """Scrape and summarize a newline-separated batch of article URLs.

    Optional per-URL titles come from *custom_article_titles* (also
    newline-separated, matched by position).  Per-URL failures are
    collected rather than raised; the combined report string is returned.
    """
    url_list = [entry.strip() for entry in urls.split('\n') if entry.strip()]
    title_list = custom_article_titles.split('\n') if custom_article_titles else []

    outputs = []
    failures = []

    # Create a progress bar
    progress = gr.Progress()

    total = len(url_list)
    for idx, current_url in tqdm(enumerate(url_list), total=total, desc="Processing URLs"):
        chosen_title = title_list[idx] if idx < len(title_list) else None
        try:
            single_result = scrape_and_summarize(current_url, custom_prompt_arg, api_name, api_key, keywords,
                                                 chosen_title)
            outputs.append(f"Results for URL {idx + 1}:\n{single_result}")
        except Exception as exc:
            failures.append(f"Error processing URL {idx + 1} ({current_url}): {str(exc)}")
            outputs.append(f"Failed to process URL {idx + 1}: {current_url}")

        # Update progress
        progress((idx + 1) / total, desc=f"Processed {idx + 1}/{total} URLs")

    # Combine results and errors
    combined_output = "\n".join(outputs)
    if failures:
        combined_output += "\n\nErrors encountered:\n" + "\n".join(failures)

    return combined_output
139
+
140
+
141
def scrape_and_summarize(url, custom_prompt_arg, api_name, api_key, keywords, custom_article_title):
    """Scrape one article, optionally summarize it via the selected API, and
    ingest the result into the database.

    Returns a human-readable report string; on failure returns an error
    string instead of raising.
    """
    try:
        # Step 1: Scrape the article
        article_data = scrape_article(url)
        print(f"Scraped Article Data: {article_data}")  # Debugging statement
        if not article_data:
            return "Failed to scrape the article."

        # Use the custom title if provided, otherwise use the scraped title
        title = custom_article_title.strip() if custom_article_title else article_data.get('title', 'Untitled')
        author = article_data.get('author', 'Unknown')
        content = article_data.get('content', '')
        ingestion_date = datetime.now().strftime('%Y-%m-%d')

        print(f"Title: {title}, Author: {author}, Content Length: {len(content)}")  # Debugging statement

        # Custom prompt for the article
        article_custom_prompt = custom_prompt_arg or "Summarize this article."

        # Step 2: Summarize the article
        summary = None
        if api_name:
            logging.debug(f"Article_Summarizer: Summarization being performed by {api_name}")

            # Sanitize filename for saving the JSON file
            sanitized_title = sanitize_filename(title)
            json_file_path = os.path.join("Results", f"{sanitized_title}_segments.json")

            # The summarizers consume a segments-style JSON file, so wrap the
            # article text in that shape first.
            with open(json_file_path, 'w') as json_file:
                json.dump([{'text': content}], json_file, indent=2)

            try:
                if api_name.lower() == 'openai':
                    # def summarize_with_openai(api_key, input_data, custom_prompt_arg)
                    summary = summarize_with_openai(api_key, json_file_path, article_custom_prompt)

                elif api_name.lower() == "anthropic":
                    # def summarize_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5):
                    summary = summarize_with_anthropic(api_key, json_file_path, article_custom_prompt)
                elif api_name.lower() == "cohere":
                    # def summarize_with_cohere(api_key, input_data, model, custom_prompt_arg)
                    summary = summarize_with_cohere(api_key, json_file_path, article_custom_prompt)

                elif api_name.lower() == "groq":
                    logging.debug(f"MAIN: Trying to summarize with groq")
                    # def summarize_with_groq(api_key, input_data, model, custom_prompt_arg):
                    summary = summarize_with_groq(api_key, json_file_path, article_custom_prompt)

                elif api_name.lower() == "openrouter":
                    logging.debug(f"MAIN: Trying to summarize with OpenRouter")
                    # def summarize_with_openrouter(api_key, input_data, custom_prompt_arg):
                    summary = summarize_with_openrouter(api_key, json_file_path, article_custom_prompt)

                elif api_name.lower() == "deepseek":
                    logging.debug(f"MAIN: Trying to summarize with DeepSeek")
                    # def summarize_with_deepseek(api_key, input_data, custom_prompt_arg):
                    summary = summarize_with_deepseek(api_key, json_file_path, article_custom_prompt)

                elif api_name.lower() == "llama.cpp":
                    logging.debug(f"MAIN: Trying to summarize with Llama.cpp")
                    # def summarize_with_llama(api_url, file_path, token, custom_prompt)
                    summary = summarize_with_llama(json_file_path, article_custom_prompt)

                elif api_name.lower() == "kobold":
                    logging.debug(f"MAIN: Trying to summarize with Kobold.cpp")
                    # def summarize_with_kobold(input_data, kobold_api_token, custom_prompt_input, api_url):
                    summary = summarize_with_kobold(json_file_path, api_key, article_custom_prompt)

                elif api_name.lower() == "ooba":
                    # def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url):
                    summary = summarize_with_oobabooga(json_file_path, api_key, article_custom_prompt)

                elif api_name.lower() == "tabbyapi":
                    # def summarize_with_tabbyapi(input_data, tabby_model, custom_prompt_input, api_key=None, api_IP):
                    summary = summarize_with_tabbyapi(json_file_path, article_custom_prompt)

                elif api_name.lower() == "vllm":
                    logging.debug(f"MAIN: Trying to summarize with VLLM")
                    # def summarize_with_vllm(api_key, input_data, custom_prompt_input):
                    summary = summarize_with_vllm(json_file_path, article_custom_prompt)

                elif api_name.lower() == "local-llm":
                    logging.debug(f"MAIN: Trying to summarize with Local LLM")
                    summary = summarize_with_local_llm(json_file_path, article_custom_prompt)

                elif api_name.lower() == "huggingface":
                    logging.debug(f"MAIN: Trying to summarize with huggingface")
                    # def summarize_with_huggingface(api_key, input_data, custom_prompt_arg):
                    # Bug fix: the original discarded the return value here,
                    # so huggingface summaries always came back as
                    # "Summary not available".
                    summary = summarize_with_huggingface(api_key, json_file_path, article_custom_prompt)
                # Add additional API handlers here...
            except requests.exceptions.ConnectionError as e:
                logging.error(f"Connection error while trying to summarize with {api_name}: {str(e)}")

            if summary:
                logging.info(f"Article_Summarizer: Summary generated using {api_name} API")
                save_summary_to_file(summary, json_file_path)
            else:
                summary = "Summary not available"
                logging.warning(f"Failed to generate summary using {api_name} API")

        else:
            summary = "Article Summarization: No API provided for summarization."

        print(f"Summary: {summary}")  # Debugging statement

        # Step 3: Ingest the article into the database
        ingestion_result = ingest_article_to_db(url, title, author, content, keywords, summary, ingestion_date,
                                                article_custom_prompt)

        return f"Title: {title}\nAuthor: {author}\nIngestion Result: {ingestion_result}\n\nSummary: {summary}\n\nArticle Contents: {content}"
    except Exception as e:
        logging.error(f"Error processing URL {url}: {str(e)}")
        return f"Failed to process URL {url}: {str(e)}"
254
+
255
+
256
def ingest_unstructured_text(text, custom_prompt, api_name, api_key, keywords, custom_article_title):
    """Optionally summarize free-form text and store it in the database.

    Only the OpenAI backend is wired up; any other api_name yields an
    "Unsupported API." summary, and no api_name skips summarization.
    """
    title = custom_article_title.strip() if custom_article_title else "Unstructured Text"
    author = "Unknown"
    ingestion_date = datetime.now().strftime('%Y-%m-%d')

    # Summarize the unstructured text
    if not api_name:
        summary = "No API provided for summarization."
    else:
        # Summarizers consume a segments-style JSON file on disk.
        json_file_path = f"Results/{title.replace(' ', '_')}_segments.json"
        with open(json_file_path, 'w') as json_file:
            json.dump([{'text': text}], json_file, indent=2)

        if api_name.lower() == 'openai':
            summary = summarize_with_openai(api_key, json_file_path, custom_prompt)
        # Add other APIs as needed
        else:
            summary = "Unsupported API."

    # Ingest the unstructured text into the database
    ingestion_result = ingest_article_to_db('Unstructured Text', title, author, text, keywords, summary,
                                            ingestion_date, custom_prompt)
    return f"Title: {title}\nSummary: {summary}\nIngestion Result: {ingestion_result}"
279
+
280
+
281
+
282
+ #
283
+ #
284
+ #######################################################################################################################
App_Function_Libraries/Audio_Files.py ADDED
@@ -0,0 +1,629 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio_Files.py
2
+ #########################################
3
+ # Audio Processing Library
4
+ # This library is used to download or load audio files from a local directory.
5
+ #
6
+ ####
7
+ #
8
+ # Functions:
9
+ #
10
+ # download_audio_file(url, save_path)
11
+ # process_audio(
12
+ # process_audio_file(audio_url, audio_file, whisper_model="small.en", api_name=None, api_key=None)
13
+ #
14
+ #
15
+ #########################################
16
+ # Imports
17
+ import json
18
+ import logging
19
+ import subprocess
20
+ import sys
21
+ import tempfile
22
+ import uuid
23
+ from datetime import datetime
24
+
25
+ import requests
26
+ import os
27
+ from gradio import gradio
28
+ import yt_dlp
29
+
30
+ from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text
31
+ from App_Function_Libraries.Chunk_Lib import improved_chunking_process
32
+ #
33
+ # Local Imports
34
+ from App_Function_Libraries.SQLite_DB import add_media_to_database, add_media_with_keywords
35
+ from App_Function_Libraries.Utils import create_download_directory, save_segments_to_json
36
+ from App_Function_Libraries.Summarization_General_Lib import save_transcription_and_summary, perform_transcription, \
37
+ perform_summarization
38
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import extract_metadata
39
+
40
+ #
41
+ #######################################################################################################################
42
+ # Function Definitions
43
+ #
44
+
45
+ MAX_FILE_SIZE = 500 * 1024 * 1024
46
+
47
+
48
def download_audio_file(url, use_cookies=False, cookies=None):
    """Download an audio file over HTTP into ./downloads and return its path.

    Args:
        url: direct URL of the audio file.
        use_cookies: when True, *cookies* (a JSON object string) is sent as
            the Cookie header; malformed JSON is logged and ignored.
        cookies: JSON string of cookie name/value pairs.

    Raises requests.RequestException on network errors and ValueError when
    the reported content-length exceeds MAX_FILE_SIZE.
    """
    try:
        # Set up the request headers
        headers = {}
        if use_cookies and cookies:
            try:
                cookie_dict = json.loads(cookies)
                headers['Cookie'] = '; '.join([f'{k}={v}' for k, v in cookie_dict.items()])
            except json.JSONDecodeError:
                logging.warning("Invalid cookie format. Proceeding without cookies.")

        # Make the request (streamed, so the size check happens before the body is read)
        response = requests.get(url, headers=headers, stream=True)
        response.raise_for_status()  # Raise an exception for bad status codes

        # Get the file size
        # Consistency fix: use the module-level MAX_FILE_SIZE constant instead
        # of a second hard-coded 500 MB literal (same value, one source of truth).
        file_size = int(response.headers.get('content-length', 0))
        if file_size > MAX_FILE_SIZE:
            raise ValueError("File size exceeds the 500MB limit.")

        # Generate a unique filename
        file_name = f"audio_{uuid.uuid4().hex[:8]}.mp3"
        save_path = os.path.join('downloads', file_name)

        # Ensure the downloads directory exists
        os.makedirs('downloads', exist_ok=True)

        # Download the file
        with open(save_path, 'wb') as f:
            for chunk in response.iter_content(chunk_size=8192):
                if chunk:
                    f.write(chunk)

        logging.info(f"Audio file downloaded successfully: {save_path}")
        return save_path

    except requests.RequestException as e:
        logging.error(f"Error downloading audio file: {str(e)}")
        raise
    except ValueError as e:
        logging.error(str(e))
        raise
    except Exception as e:
        logging.error(f"Unexpected error downloading audio file: {str(e)}")
        raise
92
+ raise
93
+
94
+
95
def process_audio(
        audio_file_path,
        num_speakers=2,
        whisper_model="small.en",
        custom_prompt_input=None,
        offset=0,
        api_name=None,
        api_key=None,
        vad_filter=False,
        rolling_summarization=False,
        detail_level=0.01,
        keywords="default,no_keyword_set",
        chunk_text_by_words=False,
        max_words=0,
        chunk_text_by_sentences=False,
        max_sentences=0,
        chunk_text_by_paragraphs=False,
        max_paragraphs=0,
        chunk_text_by_tokens=False,
        max_tokens=0
):
    """Transcribe an audio file, optionally summarize the transcript, and
    record the result in the database.

    Returns a 6-tuple (transcription_text, summary_text, json_file_path,
    summary_file_path, None, None); on error the first element is the
    error message and the remaining elements are None.
    """
    try:
        # Perform transcription
        audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)

        if audio_file_path is None or segments is None:
            logging.error("Process_Audio: Transcription failed or segments not available.")
            return "Process_Audio: Transcription failed.", None, None, None, None, None

        logging.debug(f"Process_Audio: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_Audio: Transcription segments: {segments}")

        transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
        logging.debug(f"Process_Audio: Transcription text: {transcription_text}")

        # Save segments to JSON
        segments_json_path = save_segments_to_json(segments)

        # Perform summarization
        summary_text = None
        if api_name:
            # Bug fix: the original tested `rolling_summarization is not None`,
            # which is always True (the parameter defaults to False), so the
            # non-rolling summarization branch below was unreachable and
            # summary_text stayed None.
            if rolling_summarization:
                pass
                # FIXME rolling summarization
                # summary_text = rolling_summarize_function(
                #     transcription_text,
                #     detail=detail_level,
                #     api_name=api_name,
                #     api_key=api_key,
                #     custom_prompt=custom_prompt_input,
                #     chunk_by_words=chunk_text_by_words,
                #     max_words=max_words,
                #     chunk_by_sentences=chunk_text_by_sentences,
                #     max_sentences=max_sentences,
                #     chunk_by_paragraphs=chunk_text_by_paragraphs,
                #     max_paragraphs=max_paragraphs,
                #     chunk_by_tokens=chunk_text_by_tokens,
                #     max_tokens=max_tokens
                # )
            else:
                summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key)

            if summary_text is None:
                logging.error("Summary text is None. Check summarization function.")
                summary_file_path = None
        else:
            summary_text = 'Summary not available'
            summary_file_path = None

        # Save transcription and summary
        download_path = create_download_directory("Audio_Processing")
        json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text,
                                                                           download_path)

        # Update function call to add_media_to_database so that it properly applies the title, author and file type
        # Add to database
        add_media_to_database(None, {'title': 'Audio File', 'author': 'Unknown'}, segments, summary_text, keywords,
                              custom_prompt_input, whisper_model)

        return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f"Error in process_audio: {str(e)}")
        return str(e), None, None, None, None, None
180
+
181
+
182
def process_single_audio(audio_file_path, whisper_model, api_name, api_key, keep_original,custom_keywords, source,
                         custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap, use_adaptive_chunking,
                         use_multi_level_chunking, chunk_language):
    # Transcribe a single local audio file, optionally summarize it, and add
    # it to the media database.  Returns a 3-tuple:
    #   (progress log text, transcription text, summary text)
    # Errors are captured into the progress log and the returned strings
    # instead of being raised.
    # NOTE(review): the chunk_* parameters are accepted but never used in
    # this body — presumably reserved for chunked summarization; confirm.
    progress = []  # accumulated status lines, joined with '\n' on return
    transcription = ""
    summary = ""

    def update_progress(message):
        # Append one status line and return the log so far.
        progress.append(message)
        return "\n".join(progress)

    try:
        # Check file size before processing
        file_size = os.path.getsize(audio_file_path)
        if file_size > MAX_FILE_SIZE:
            update_progress(f"File size ({file_size / (1024 * 1024):.2f} MB) exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f} MB. Skipping this file.")
            return "\n".join(progress), "", ""

        # Perform transcription
        update_progress("Starting transcription...")
        segments = speech_to_text(audio_file_path, whisper_model=whisper_model)
        # Each segment is expected to carry its text under the 'Text' key.
        transcription = " ".join([segment['Text'] for segment in segments])
        update_progress("Audio transcribed successfully.")

        # Perform summarization if API is provided
        if api_name and api_key:
            update_progress("Starting summarization...")
            summary = perform_summarization(api_name, transcription, "Summarize the following audio transcript",
                                            api_key)
            update_progress("Audio summarized successfully.")
        else:
            summary = "No summary available"

        # Prepare keywords: base tags plus any user-supplied extras
        keywords = "audio,transcription"
        if custom_keywords:
            keywords += f",{custom_keywords}"

        # Add to database
        add_media_with_keywords(
            url=source,
            title=os.path.basename(audio_file_path),
            media_type='audio',
            content=transcription,
            keywords=keywords,
            prompt="Summarize the following audio transcript",
            summary=summary,
            transcription_model=whisper_model,
            author="Unknown",
            ingestion_date=None  # This will use the current date
        )
        update_progress("Audio file added to database successfully.")

        # Only ever delete files we downloaded ourselves — uploads are
        # never removed regardless of keep_original.
        if not keep_original and source != "Uploaded File":
            os.remove(audio_file_path)
            update_progress(f"Temporary file {audio_file_path} removed.")
        elif keep_original and source != "Uploaded File":
            update_progress(f"Original audio file kept at: {audio_file_path}")

    except Exception as e:
        update_progress(f"Error processing {source}: {str(e)}")
        transcription = f"Error: {str(e)}"
        summary = "No summary due to error"

    return "\n".join(progress), transcription, summary
247
+
248
+
249
def process_audio_files(audio_urls, audio_file, whisper_model, api_name, api_key, use_cookies, cookies, keep_original,
                        custom_keywords, custom_prompt_input, chunk_method, max_chunk_size, chunk_overlap,
                        use_adaptive_chunking, use_multi_level_chunking, chunk_language, diarize):
    """
    Download, transcribe, optionally summarize, and store a batch of audio URLs
    plus an optional uploaded file.

    Returns a 3-tuple: (progress log, all transcriptions joined by blank lines,
    all summaries joined by blank lines).

    Fix vs. previous version: `summary` was read (appended to results and stored
    in the database) without ever being assigned when a URL's transcription came
    back empty, raising NameError on the first such URL and silently reusing a
    stale summary afterwards. It is now initialized per item.
    """
    progress = []
    temp_files = []
    all_transcriptions = []
    all_summaries = []

    def update_progress(message):
        # Accumulate progress lines; returns the joined log for convenience.
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        # Best-effort removal of every temp file created during this run.
        for file in temp_files:
            try:
                if os.path.exists(file):
                    os.remove(file)
                    update_progress(f"Temporary file {file} removed.")
            except Exception as e:
                update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    def reencode_mp3(mp3_file_path):
        # Re-encode with libmp3lame to repair malformed downloads before WAV conversion.
        try:
            reencoded_mp3_path = mp3_file_path.replace(".mp3", "_reencoded.mp3")
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, '-codec:a', 'libmp3lame', reencoded_mp3_path], check=True)
            update_progress(f"Re-encoded {mp3_file_path} to {reencoded_mp3_path}.")
            return reencoded_mp3_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error re-encoding {mp3_file_path}: {str(e)}")
            raise

    def convert_mp3_to_wav(mp3_file_path):
        # speech_to_text expects WAV input.
        try:
            wav_file_path = mp3_file_path.replace(".mp3", ".wav")
            subprocess.run([ffmpeg_cmd, '-i', mp3_file_path, wav_file_path], check=True)
            update_progress(f"Converted {mp3_file_path} to {wav_file_path}.")
            return wav_file_path
        except subprocess.CalledProcessError as e:
            update_progress(f"Error converting {mp3_file_path} to WAV: {str(e)}")
            raise

    try:
        # Check and set the ffmpeg command (bundled binary on Windows, PATH elsewhere).
        global ffmpeg_cmd
        if os.name == "nt":
            logging.debug("Running on Windows")
            ffmpeg_cmd = os.path.join(os.getcwd(), "Bin", "ffmpeg.exe")
        else:
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

        # Ensure ffmpeg is accessible
        if not os.path.exists(ffmpeg_cmd) and os.name == "nt":
            raise FileNotFoundError(f"ffmpeg executable not found at path: {ffmpeg_cmd}")

        # Define chunk options early to avoid undefined errors
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }

        # Process multiple URLs (one per line)
        urls = [url.strip() for url in audio_urls.split('\n') if url.strip()]

        for i, url in enumerate(urls):
            update_progress(f"Processing URL {i + 1}/{len(urls)}: {url}")

            # Download and process audio file
            audio_file_path = download_audio_file(url, use_cookies, cookies)
            if not os.path.exists(audio_file_path):
                update_progress(f"Downloaded file not found: {audio_file_path}")
                continue

            temp_files.append(audio_file_path)
            update_progress("Audio file downloaded successfully.")

            # Re-encode MP3 to fix potential issues
            reencoded_mp3_path = reencode_mp3(audio_file_path)
            if not os.path.exists(reencoded_mp3_path):
                update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                continue

            temp_files.append(reencoded_mp3_path)

            # Convert re-encoded MP3 to WAV
            wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
            if not os.path.exists(wav_file_path):
                update_progress(f"Converted WAV file not found: {wav_file_path}")
                continue

            temp_files.append(wav_file_path)

            # Initialize per-item results so they are always defined below,
            # even when transcription is empty (previous NameError bug).
            transcription = ""
            summary = "No summary available"

            # Transcribe audio
            if diarize:
                segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

            # Handle segments nested under 'segments' key
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']

            if isinstance(segments, list):
                transcription = " ".join([segment.get('Text', '') for segment in segments])
                update_progress("Audio transcribed successfully.")
            else:
                update_progress("Unexpected segments format received from speech_to_text.")
                logging.error(f"Unexpected segments format: {segments}")
                continue

            if not transcription.strip():
                update_progress("Transcription is empty.")
            else:
                # Apply chunking
                chunked_text = improved_chunking_process(transcription, chunk_options)

                # Summarize
                if api_name:
                    try:
                        summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                        update_progress("Audio summarized successfully.")
                    except Exception as e:
                        logging.error(f"Error during summarization: {str(e)}")
                        summary = "Summary generation failed"
                else:
                    summary = "No summary available (API not provided)"

            all_transcriptions.append(transcription)
            all_summaries.append(summary)

            # Add to database
            add_media_with_keywords(
                url=url,
                title=os.path.basename(wav_file_path),
                media_type='audio',
                content=transcription,
                keywords=custom_keywords,
                prompt=custom_prompt_input,
                summary=summary,
                transcription_model=whisper_model,
                author="Unknown",
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Audio file processed and added to database.")

        # Process uploaded file if provided
        if audio_file:
            if os.path.getsize(audio_file.name) > MAX_FILE_SIZE:
                update_progress(
                    f"Uploaded file size exceeds the maximum limit of {MAX_FILE_SIZE / (1024 * 1024):.2f}MB. Skipping this file.")
            else:
                # Re-encode MP3 to fix potential issues
                reencoded_mp3_path = reencode_mp3(audio_file.name)
                if not os.path.exists(reencoded_mp3_path):
                    update_progress(f"Re-encoded file not found: {reencoded_mp3_path}")
                    return update_progress("Processing failed: Re-encoded file not found"), "", ""

                temp_files.append(reencoded_mp3_path)

                # Convert re-encoded MP3 to WAV
                wav_file_path = convert_mp3_to_wav(reencoded_mp3_path)
                if not os.path.exists(wav_file_path):
                    update_progress(f"Converted WAV file not found: {wav_file_path}")
                    return update_progress("Processing failed: Converted WAV file not found"), "", ""

                temp_files.append(wav_file_path)

                # Initialize transcription
                transcription = ""

                if diarize:
                    segments = speech_to_text(wav_file_path, whisper_model=whisper_model, diarize=True)
                else:
                    segments = speech_to_text(wav_file_path, whisper_model=whisper_model)

                # Handle segments nested under 'segments' key
                if isinstance(segments, dict) and 'segments' in segments:
                    segments = segments['segments']

                if isinstance(segments, list):
                    transcription = " ".join([segment.get('Text', '') for segment in segments])
                else:
                    update_progress("Unexpected segments format received from speech_to_text.")
                    logging.error(f"Unexpected segments format: {segments}")

                chunked_text = improved_chunking_process(transcription, chunk_options)

                if api_name and api_key:
                    try:
                        summary = perform_summarization(api_name, chunked_text, custom_prompt_input, api_key)
                        update_progress("Audio summarized successfully.")
                    except Exception as e:
                        logging.error(f"Error during summarization: {str(e)}")
                        summary = "Summary generation failed"
                else:
                    summary = "No summary available (API not provided)"

                all_transcriptions.append(transcription)
                all_summaries.append(summary)

                add_media_with_keywords(
                    url="Uploaded File",
                    title=os.path.basename(wav_file_path),
                    media_type='audio',
                    content=transcription,
                    keywords=custom_keywords,
                    prompt=custom_prompt_input,
                    summary=summary,
                    transcription_model=whisper_model,
                    author="Unknown",
                    ingestion_date=datetime.now().strftime('%Y-%m-%d')
                )
                update_progress("Uploaded file processed and added to database.")

        # Final cleanup
        if not keep_original:
            cleanup_files()

        final_progress = update_progress("All processing complete.")
        final_transcriptions = "\n\n".join(all_transcriptions)
        final_summaries = "\n\n".join(all_summaries)

        return final_progress, final_transcriptions, final_summaries

    except Exception as e:
        logging.error(f"Error processing audio files: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", ""
483
+
484
+
485
def download_youtube_audio(url: str) -> str:
    """Download a YouTube video's audio track as a WAV file.

    Uses yt-dlp's FFmpegExtractAudio post-processor; returns the local path
    of the resulting .wav file (video title as filename).
    """
    downloader_options = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'wav',
            'preferredquality': '192',
        }],
        'outtmpl': '%(title)s.%(ext)s'
    }
    with yt_dlp.YoutubeDL(downloader_options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        downloaded_name = downloader.prepare_filename(video_info)
    # prepare_filename reports the pre-postprocessing name; swap its extension for .wav.
    return downloaded_name.rsplit('.', 1)[0] + '.wav'
499
+
500
+
501
def process_podcast(url, title, author, keywords, custom_prompt, api_name, api_key, whisper_model,
                    keep_original=False, enable_diarization=False, use_cookies=False, cookies=None,
                    chunk_method=None, max_chunk_size=300, chunk_overlap=0, use_adaptive_chunking=False,
                    use_multi_level_chunking=False, chunk_language='english'):
    """
    Download, transcribe, optionally summarize a podcast episode and store it
    in the media database.

    Returns a 7-tuple: (progress log, full content, summary, title, author,
    keywords, error message).

    Fixes vs. previous version:
    - guards against extract_metadata() returning None before calling .get();
    - unwraps speech_to_text() output that arrives as {'segments': [...]}
      (the cached-result shape), matching how process_audio_files handles it.
    """
    progress = []
    error_message = ""
    temp_files = []

    def update_progress(message):
        progress.append(message)
        return "\n".join(progress)

    def cleanup_files():
        # Only remove temp files when the caller did not ask to keep originals.
        if not keep_original:
            for file in temp_files:
                try:
                    if os.path.exists(file):
                        os.remove(file)
                        update_progress(f"Temporary file {file} removed.")
                except Exception as e:
                    update_progress(f"Failed to remove temporary file {file}: {str(e)}")

    try:
        # Download podcast
        audio_file = download_audio_file(url, use_cookies, cookies)
        temp_files.append(audio_file)
        update_progress("Podcast downloaded successfully.")

        # Extract metadata; tolerate a None return so the .get() calls are safe.
        metadata = extract_metadata(url) or {}
        title = title or metadata.get('title', 'Unknown Podcast')
        author = author or metadata.get('uploader', 'Unknown Author')

        # Format metadata for storage
        metadata_text = f"""
Metadata:
Title: {title}
Author: {author}
Series: {metadata.get('series', 'N/A')}
Episode: {metadata.get('episode', 'N/A')}
Season: {metadata.get('season', 'N/A')}
Upload Date: {metadata.get('upload_date', 'N/A')}
Duration: {metadata.get('duration', 'N/A')} seconds
Description: {metadata.get('description', 'N/A')}
"""

        # Update keywords with series/episode/season tags when present
        new_keywords = []
        if metadata.get('series'):
            new_keywords.append(f"series:{metadata['series']}")
        if metadata.get('episode'):
            new_keywords.append(f"episode:{metadata['episode']}")
        if metadata.get('season'):
            new_keywords.append(f"season:{metadata['season']}")

        keywords = f"{keywords},{','.join(new_keywords)}" if keywords else ','.join(new_keywords)

        update_progress(f"Metadata extracted - Title: {title}, Author: {author}, Keywords: {keywords}")

        # Transcribe the podcast
        try:
            if enable_diarization:
                segments = speech_to_text(audio_file, whisper_model=whisper_model, diarize=True)
            else:
                segments = speech_to_text(audio_file, whisper_model=whisper_model)
            # Cached transcriptions may come back as {'segments': [...]} — unwrap
            # so segment['Text'] below always works (consistent with process_audio_files).
            if isinstance(segments, dict) and 'segments' in segments:
                segments = segments['segments']
            transcription = " ".join([segment['Text'] for segment in segments])
            update_progress("Podcast transcribed successfully.")
        except Exception as e:
            error_message = f"Transcription failed: {str(e)}"
            raise

        # Apply chunking
        chunk_options = {
            'method': chunk_method,
            'max_size': max_chunk_size,
            'overlap': chunk_overlap,
            'adaptive': use_adaptive_chunking,
            'multi_level': use_multi_level_chunking,
            'language': chunk_language
        }
        chunked_text = improved_chunking_process(transcription, chunk_options)

        # Combine metadata and transcription
        full_content = metadata_text + "\n\nTranscription:\n" + transcription

        # Summarize if API is provided
        summary = None
        if api_name and api_key:
            try:
                summary = perform_summarization(api_name, chunked_text, custom_prompt, api_key)
                update_progress("Podcast summarized successfully.")
            except Exception as e:
                error_message = f"Summarization failed: {str(e)}"
                raise

        # Add to database
        try:
            add_media_with_keywords(
                url=url,
                title=title,
                media_type='podcast',
                content=full_content,
                keywords=keywords,
                prompt=custom_prompt,
                summary=summary or "No summary available",
                transcription_model=whisper_model,
                author=author,
                ingestion_date=datetime.now().strftime('%Y-%m-%d')
            )
            update_progress("Podcast added to database successfully.")
        except Exception as e:
            error_message = f"Error adding podcast to database: {str(e)}"
            raise

        # Cleanup
        cleanup_files()

        return (update_progress("Processing complete."), full_content, summary or "No summary generated.",
                title, author, keywords, error_message)

    except Exception as e:
        logging.error(f"Error processing podcast: {str(e)}")
        cleanup_files()
        return update_progress(f"Processing failed: {str(e)}"), "", "", "", "", "", str(e)
625
+
626
+
627
+ #
628
+ #
629
+ #######################################################################################################################
App_Function_Libraries/Audio_Transcription_Lib.py ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Audio_Transcription_Lib.py
2
+ #########################################
3
+ # Transcription Library
4
+ # This library is used to perform transcription of audio files.
5
+ # Currently, uses faster_whisper for transcription.
6
+ #
7
+ ####
8
+ import configparser
9
+ ####################
10
+ # Function List
11
+ #
12
+ # 1. convert_to_wav(video_file_path, offset=0, overwrite=False)
13
+ # 2. speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='small.en', vad_filter=False)
14
+ #
15
+ ####################
16
+ #
17
+ # Import necessary libraries to run solo for testing
18
+ import json
19
+ import logging
20
+ import os
21
+ import sys
22
+ import subprocess
23
+ import time
24
+
25
+ # Import Local
26
+ #
27
+ #######################################################################################################################
28
+ # Function Definitions
29
+ #
30
+
31
+ # Convert video .m4a into .wav using ffmpeg
32
+ # ffmpeg -i "example.mp4" -ar 16000 -ac 1 -c:a pcm_s16le "output.wav"
33
+ # https://www.gyan.dev/ffmpeg/builds/
34
+ #
35
+
36
+
37
+ # os.system(r'.\Bin\ffmpeg.exe -ss 00:00:00 -i "{video_file_path}" -ar 16000 -ac 1 -c:a pcm_s16le "{out_path}"')
38
def convert_to_wav(video_file_path, offset=0, overwrite=False):
    """
    Convert an audio/video file to 16 kHz mono 16-bit PCM WAV via ffmpeg.

    Returns the .wav output path. If the target file already exists and
    `overwrite` is False, conversion is skipped. On unexpected failure the
    original contract is preserved: a {'error': message} dict is returned
    instead of raising — callers should check for it.

    Fixes vs. previous version:
    - the posix branch no longer builds a shell string via os.system with the
      raw file path interpolated (command-injection risk); both platforms now
      use subprocess.run with list arguments;
    - the `offset` parameter is now honored as the ffmpeg -ss start time
      (default 0 matches the old hard-coded 00:00:00);
    - removed a duplicated out_path assignment and a copy-pasted
      "speech-to-text" log message.
    """
    out_path = os.path.splitext(video_file_path)[0] + ".wav"

    if os.path.exists(out_path) and not overwrite:
        print(f"File '{out_path}' already exists. Skipping conversion.")
        logging.info(f"Skipping conversion as file already exists: {out_path}")
        return out_path
    print("Starting conversion process of .m4a to .WAV")

    try:
        if sys.platform.startswith('win'):
            ffmpeg_cmd = ".\\Bin\\ffmpeg.exe"  # bundled binary on Windows
            logging.debug(f"ffmpeg_cmd: {ffmpeg_cmd}")
        else:
            ffmpeg_cmd = 'ffmpeg'  # Assume 'ffmpeg' is in PATH for non-Windows systems

        command = [
            ffmpeg_cmd,
            "-ss", str(offset),     # start time (was hard-coded to 00:00:00)
            "-i", video_file_path,
            "-ar", "16000",         # audio sample rate
            "-ac", "1",             # mono
            "-c:a", "pcm_s16le",    # 16-bit PCM audio codec
            out_path
        ]
        # Redirect stdin from the null device so ffmpeg never waits for input.
        with open(os.devnull, 'rb') as null_file:
            result = subprocess.run(command, stdin=null_file, text=True, capture_output=True)
        if result.returncode == 0:
            logging.info("FFmpeg executed successfully")
            logging.debug("FFmpeg output: %s", result.stdout)
        else:
            logging.error("Error in running FFmpeg")
            logging.error("FFmpeg stderr: %s", result.stderr)
            raise RuntimeError(f"FFmpeg error: {result.stderr}")
        logging.info("Conversion to WAV completed: %s", out_path)
    except subprocess.CalledProcessError as e:
        logging.error("Error executing FFmpeg command: %s", str(e))
        raise RuntimeError("Error converting video file to WAV")
    except Exception as e:
        # Preserved legacy contract: unexpected errors (including the
        # RuntimeError raised above) are returned as a dict, not raised.
        logging.error("convert_to_wav: Error converting file: %s", str(e))
        return {"error": str(e)}
    return out_path
93
+
94
+
95
+ # Transcribe .wav into .segments.json
96
def speech_to_text(audio_file_path, selected_source_lang='en', whisper_model='medium.en', vad_filter=False, diarize=False):
    """
    Transcribe a WAV file with faster_whisper.

    Returns a list of segment dicts: {'Time_Start', 'Time_End', 'Text'}.
    Results are cached next to the audio file as <name>.segments.json
    (stored as {'segments': [...]}) plus a prettified copy.

    Fixes vs. previous version:
    - the Whisper model is only loaded on a cache miss (it used to be
      instantiated even when the cached JSON was returned immediately);
    - a cache hit now returns the segment *list*, matching the
      fresh-transcription path (it previously returned the raw
      {'segments': ...} dict, crashing callers that index segment['Text']);
    - removed the 'global segments' module-state leak.

    Note: `diarize` is accepted for interface compatibility but is not used
    here — presumably diarization is handled by a separate library; TODO confirm.
    """
    if audio_file_path is None:
        raise ValueError("speech-to-text: No audio file provided")
    logging.info("speech-to-text: Audio file path: %s", audio_file_path)

    _, file_ending = os.path.splitext(audio_file_path)
    out_file = audio_file_path.replace(file_ending, ".segments.json")
    prettified_out_file = audio_file_path.replace(file_ending, ".segments_pretty.json")

    # Cache hit: reuse the previously saved transcription without loading the model.
    if os.path.exists(out_file):
        logging.info("speech-to-text: Segments file already exists: %s", out_file)
        with open(out_file) as f:
            cached = json.load(f)
        # Files are written as {'segments': [...]}; unwrap so callers always get a list.
        if isinstance(cached, dict) and 'segments' in cached:
            return cached['segments']
        return cached

    logging.info('speech-to-text: Loading faster_whisper model: %s', whisper_model)
    from faster_whisper import WhisperModel
    # Retrieve processing device (cpu/cuda) from the configuration file.
    config = configparser.ConfigParser()
    config.read('config.txt')
    processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
    model = WhisperModel(whisper_model, device=processing_choice)

    time_start = time.time()
    try:
        logging.info('speech-to-text: Starting transcription...')
        options = dict(language=selected_source_lang, beam_size=5, best_of=5, vad_filter=vad_filter)
        transcribe_options = dict(task="transcribe", **options)
        segments_raw, info = model.transcribe(audio_file_path, **transcribe_options)

        segments = []
        for segment_chunk in segments_raw:
            chunk = {
                "Time_Start": segment_chunk.start,
                "Time_End": segment_chunk.end,
                "Text": segment_chunk.text
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
        if not segments:
            raise RuntimeError("No transcription produced. The audio file may be invalid or empty.")
        logging.info("speech-to-text: Transcription completed in %.2f seconds", time.time() - time_start)

        # Persist under a 'segments' key (both pretty and compact copies).
        output_data = {'segments': segments}

        logging.info("speech-to-text: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w') as f:
            json.dump(output_data, f, indent=2)

        logging.info("speech-to-text: Saving JSON to %s", out_file)
        with open(out_file, 'w') as f:
            json.dump(output_data, f)

    except Exception as e:
        logging.error("speech-to-text: Error transcribing audio: %s", str(e))
        raise RuntimeError("speech-to-text: Error transcribing audio")
    return segments
155
+
156
+ #
157
+ #
158
+ #######################################################################################################################
App_Function_Libraries/Book_Ingestion_Lib.py ADDED
@@ -0,0 +1,95 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Book_Ingestion_Lib.py
2
+ #########################################
3
+ # Library to hold functions for ingesting book files.#
4
+ #
5
+ ####################
6
+ # Function List
7
+ #
8
+ # 1. ingest_text_file(file_path, title=None, author=None, keywords=None):
9
+ # 2.
10
+ #
11
+ #
12
+ ####################
13
+
14
+
15
+ # Import necessary libraries
16
+ import os
17
+ import re
18
+ from datetime import datetime
19
+ import logging
20
+
21
+
22
+ # Import Local
23
+ from SQLite_DB import add_media_with_keywords
24
+
25
+ #######################################################################################################################
26
+ # Function Definitions
27
+ #
28
+
29
+ # Ingest a text file into the database with Title/Author/Keywords
30
+
31
def extract_epub_metadata(content):
    """Pull 'Title:' and 'Author:' header lines out of converted-epub text.

    Returns a (title, author) tuple; either element is None when the
    corresponding line is absent.
    """
    def first_value(label):
        match = re.search(rf'{label}:\s*(.*?)\n', content)
        return match.group(1) if match else None

    return first_value('Title'), first_value('Author')
39
+
40
+
41
def ingest_text_file(file_path, title=None, author=None, keywords=None):
    """Ingest a plain-text file into the media database.

    When the keywords mention 'epub_converted', Title/Author lines embedded in
    the text are used to fill missing metadata. Falls back to the filename for
    the title and 'Unknown' for the author. Returns a human-readable status
    string (success or error); never raises.

    NOTE(review): the default keyword string tags every file with
    'epub_converted', even non-epub text files — confirm this is intended.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as source:
            content = source.read()

        # Converted epubs carry their metadata inline; harvest it if present.
        if 'epub_converted' in (keywords or ''):
            extracted_title, extracted_author = extract_epub_metadata(content)
            title = title or extracted_title
            author = author or extracted_author

        # Fall back to filename (sans extension) and 'Unknown' author.
        title = title or os.path.splitext(os.path.basename(file_path))[0]
        author = author or 'Unknown'

        # Prepend the standard tags; keep any caller-supplied keywords.
        keywords = f'text_file,epub_converted,{keywords}' if keywords else 'text_file,epub_converted'

        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=content,
            keywords=keywords,
            prompt='No prompt for text files',
            summary='No summary for text files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"Text file '{title}' by {author} ingested successfully."
    except Exception as e:
        logging.error(f"Error ingesting text file: {str(e)}")
        return f"Error ingesting text file: {str(e)}"
83
+ return f"Error ingesting text file: {str(e)}"
84
+
85
+
86
def ingest_folder(folder_path, keywords=None):
    """Ingest every .txt file in *folder_path* into the media database.

    Returns the list of per-file status strings from ingest_text_file.
    Fix: the previous version collected `results` but never returned it,
    so callers always received None.
    """
    results = []
    for filename in os.listdir(folder_path):
        if filename.lower().endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            result = ingest_text_file(file_path, keywords=keywords)
            results.append(result)
    return results
93
+
94
+
95
+
App_Function_Libraries/Chunk_Lib.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Chunk_Lib.py
2
+ #########################################
3
+ # Chunking Library
4
+ # This library is used to perform chunking of input files.
5
+ # Currently, uses naive approaches. Nothing fancy.
6
+ #
7
+ ####
8
+ # Import necessary libraries
9
+ import logging
10
+ import re
11
+
12
+ from typing import List, Optional, Tuple, Dict, Any
13
+
14
+ from openai import OpenAI
15
+ from tqdm import tqdm
16
+ #
17
+ # Import 3rd party
18
+ from transformers import GPT2Tokenizer
19
+ import nltk
20
+ from nltk.tokenize import sent_tokenize, word_tokenize
21
+ from sklearn.feature_extraction.text import TfidfVectorizer
22
+ from sklearn.metrics.pairwise import cosine_similarity
23
+ #
24
+ # Import Local
25
+ from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
26
+ from App_Function_Libraries.Utils import load_comprehensive_config
27
+
28
+
29
+ #
30
+ #######################################################################################################################
31
+ # Function Definitions
32
+ #
33
+
34
+ # FIXME - Make sure it only downloads if it already exists, and does a check first.
35
+ # Ensure NLTK data is downloaded
36
def ntlk_prep():
    """Ensure the NLTK 'punkt' sentence tokenizer is available.

    nltk.download is already a no-op when the resource exists; quiet=True
    suppresses the repeated console chatter, matching how
    chunk_text_by_sentences calls it. (Function name typo 'ntlk' is kept for
    backward compatibility with existing callers.)
    """
    nltk.download('punkt', quiet=True)
38
+
39
+ # Load GPT2 tokenizer
40
+ tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
41
+
42
+ # Load Config file for API keys
43
+ config = load_comprehensive_config()
44
+ openai_api_key = config.get('API', 'openai_api_key', fallback=None)
45
+
46
def load_document(file_path):
    """Read a text file and collapse every whitespace run into a single space."""
    with open(file_path, 'r') as source:
        raw_text = source.read()
    normalized = re.sub('\\s+', ' ', raw_text)
    return normalized.strip()
50
+
51
+
52
def improved_chunking_process(text: str, chunk_options: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Chunk *text* according to chunk_options and attach positional metadata.

    Options: method ('words'|'sentences'|'paragraphs'|'tokens'), max_size,
    overlap, language, adaptive (adjusts max_size to text complexity), and
    multi_level (paragraphs first, then re-chunk). Unknown methods return the
    whole text as a single chunk. Each result is
    {'text': chunk, 'metadata': {...}}.
    """
    method = chunk_options.get('method', 'words')
    max_size = chunk_options.get('max_size', 300)
    overlap = chunk_options.get('overlap', 0)
    language = chunk_options.get('language', 'english')

    if chunk_options.get('adaptive', False):
        max_size = adaptive_chunk_size(text, max_size)

    if chunk_options.get('multi_level', False):
        chunks = multi_level_chunking(text, method, max_size, overlap, language)
    elif method == 'words':
        chunks = chunk_text_by_words(text, max_size, overlap)
    elif method == 'sentences':
        chunks = chunk_text_by_sentences(text, max_size, overlap, language)
    elif method == 'paragraphs':
        chunks = chunk_text_by_paragraphs(text, max_size, overlap)
    elif method == 'tokens':
        chunks = chunk_text_by_tokens(text, max_size, overlap)
    else:
        chunks = [text]  # no chunking applied

    return [{'text': piece, 'metadata': get_chunk_metadata(piece, text)} for piece in chunks]
78
+
79
+
80
def adaptive_chunk_size(text: str, base_size: int) -> int:
    """Adjust the chunk size based on simple text complexity.

    Texts with an average word length above 6 characters (arbitrary threshold
    for "complex" text) get 80% of base_size; everything else keeps base_size.
    Fix: empty or whitespace-only text previously caused ZeroDivisionError;
    it now returns base_size unchanged.
    """
    words = text.split()
    if not words:
        return base_size
    avg_word_length = sum(len(word) for word in words) / len(words)
    if avg_word_length > 6:
        return int(base_size * 0.8)
    return base_size
86
+
87
+
88
def multi_level_chunking(text: str, method: str, max_size: int, overlap: int, language: str) -> List[str]:
    """Two-pass chunking: paragraphs first (at double the size budget), then
    re-chunk each paragraph with the requested method. Unrecognized methods
    leave paragraphs intact.
    """
    coarse_chunks = chunk_text_by_paragraphs(text, max_size * 2, overlap)

    fine_chunks = []
    for paragraph in coarse_chunks:
        if method == 'words':
            fine_chunks.extend(chunk_text_by_words(paragraph, max_size, overlap))
        elif method == 'sentences':
            fine_chunks.extend(chunk_text_by_sentences(paragraph, max_size, overlap, language))
        else:
            fine_chunks.append(paragraph)
    return fine_chunks
103
+
104
+
105
def chunk_text_by_words(text: str, max_words: int = 300, overlap: int = 0) -> List[str]:
    """Split text into chunks of at most max_words words, overlapping by
    `overlap` words between consecutive chunks.

    Fix: an overlap >= max_words previously produced a range step <= 0
    (ValueError); the step is now clamped to at least 1.
    """
    words = text.split()
    step = max(1, max_words - overlap)  # guard: keep the scan advancing
    chunks = [' '.join(words[i:i + max_words]) for i in range(0, len(words), step)]
    return post_process_chunks(chunks)
112
+
113
+
114
def chunk_text_by_sentences(text: str, max_sentences: int = 10, overlap: int = 0, language: str = 'english') -> List[
    str]:
    """Split text into chunks of at most max_sentences sentences, overlapping
    by `overlap` sentences between consecutive chunks.

    Fix: an overlap >= max_sentences previously produced a range step <= 0
    (ValueError); the step is now clamped to at least 1.
    """
    nltk.download('punkt', quiet=True)
    sentences = nltk.sent_tokenize(text, language=language)
    step = max(1, max_sentences - overlap)  # guard: keep the scan advancing
    chunks = [' '.join(sentences[i:i + max_sentences]) for i in range(0, len(sentences), step)]
    return post_process_chunks(chunks)
123
+
124
+
125
def chunk_text_by_paragraphs(text: str, max_paragraphs: int = 5, overlap: int = 0) -> List[str]:
    """Split text (on blank lines) into chunks of at most max_paragraphs
    paragraphs, overlapping by `overlap` paragraphs between chunks.

    Fix: an overlap >= max_paragraphs previously produced a range step <= 0
    (ValueError); the step is now clamped to at least 1.
    """
    paragraphs = re.split(r'\n\s*\n', text)
    step = max(1, max_paragraphs - overlap)  # guard: keep the scan advancing
    chunks = ['\n\n'.join(paragraphs[i:i + max_paragraphs]) for i in range(0, len(paragraphs), step)]
    return post_process_chunks(chunks)
132
+
133
+
134
def chunk_text_by_tokens(text: str, max_tokens: int = 1000, overlap: int = 0) -> List[str]:
    """Greedy word-packing under a rough token budget.

    Each word costs an estimated len(word)//4 + 1 tokens (a crude proxy; a
    real tokenizer such as GPT-2's would be more accurate). When a chunk
    overflows, the last `overlap` words seed the next chunk.
    """
    def estimated_cost(word: str) -> int:
        return len(word) // 4 + 1

    chunks = []
    buffer = []
    buffer_cost = 0

    for word in text.split():
        cost = estimated_cost(word)
        if buffer and buffer_cost + cost > max_tokens:
            chunks.append(' '.join(buffer))
            buffer = buffer[-overlap:] if overlap > 0 else []
            buffer_cost = sum(estimated_cost(w) for w in buffer)
        buffer.append(word)
        buffer_cost += cost

    if buffer:
        chunks.append(' '.join(buffer))

    return post_process_chunks(chunks)
156
+
157
+
158
def post_process_chunks(chunks: List[str]) -> List[str]:
    """Strip surrounding whitespace from each chunk and drop empty ones."""
    cleaned = []
    for chunk in chunks:
        trimmed = chunk.strip()
        if trimmed:
            cleaned.append(trimmed)
    return cleaned
160
+
161
+
162
def get_chunk_metadata(chunk: str, full_text: str) -> Dict[str, Any]:
    """Return positional and size metadata for a chunk of *full_text*.

    Fix: chunks produced by the word/sentence splitters are whitespace-normalized
    and may not be literal substrings of full_text; str.index() then raised
    ValueError. str.find() is used instead, and both indices are -1 when the
    chunk cannot be located.
    """
    start_index = full_text.find(chunk)
    end_index = start_index + len(chunk) if start_index != -1 else -1
    return {
        'start_index': start_index,
        'end_index': end_index,
        'word_count': len(chunk.split()),
        'char_count': len(chunk)
    }
170
+
171
+
172
+ # Hybrid approach, chunk each sentence while ensuring total token size does not exceed a maximum number
173
def chunk_text_hybrid(text, max_tokens=1000):
    """Sentence-level chunking capped by GPT-2 token count per chunk.

    Sentences are packed greedily into chunks whose total token count (per the
    module-level GPT-2 tokenizer) stays within max_tokens; an oversized first
    sentence starts its own chunk.
    """
    chunks = []
    pending_sentences = []
    pending_tokens = 0

    for sentence in nltk.tokenize.sent_tokenize(text):
        sentence_tokens = len(tokenizer.encode(sentence))
        if pending_tokens + sentence_tokens <= max_tokens:
            pending_sentences.append(sentence)
            pending_tokens += sentence_tokens
        else:
            chunks.append(' '.join(pending_sentences))
            pending_sentences = [sentence]
            pending_tokens = sentence_tokens

    if pending_sentences:
        chunks.append(' '.join(pending_sentences))

    return chunks
193
+
194
+ # Thanks openai
195
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split on *delimiter*, then greedily recombine pieces under the token
    budget (via combine_chunks_with_no_minimum); the delimiter is re-appended
    to every returned chunk. Warns on stdout when pieces are dropped for
    exceeding max_tokens.
    """
    pieces = input_string.split(delimiter)
    combined, _, dropped_count = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
    if dropped_count > 0:
        print(f"Warning: {dropped_count} chunks were dropped due to exceeding the token limit.")
    return [f"{chunk}{delimiter}" for chunk in combined]
205
+
206
+
207
def recursive_summarize_chunks(chunks, summarize_func, custom_prompt):
    """Summarize chunks cumulatively.

    The first chunk is summarized directly; each later chunk is appended
    (separated by a blank line) to the running summary and re-summarized.
    Returns the summary produced after each step, in order.
    """
    rolling_summary = ""
    step_summaries = []

    for index, piece in enumerate(chunks):
        if index == 0:
            rolling_summary = summarize_func(piece, custom_prompt)
        else:
            rolling_summary = summarize_func(rolling_summary + "\n\n" + piece, custom_prompt)
        step_summaries.append(rolling_summary)

    return step_summaries
221
+
222
+
223
# Sample text for testing
# NOTE: module-level fixture used only by the commented-out smoke tests below;
# safe to remove along with them once the chunkers have real unit tests.
sample_text = """
Natural language processing (NLP) is a subfield of linguistics, computer science, and artificial intelligence
concerned with the interactions between computers and human language, in particular how to program computers
to process and analyze large amounts of natural language data. The result is a computer capable of "understanding"
the contents of documents, including the contextual nuances of the language within them. The technology can then
accurately extract information and insights contained in the documents as well as categorize and organize the documents themselves.

Challenges in natural language processing frequently involve speech recognition, natural language understanding,
and natural language generation.

Natural language processing has its roots in the 1950s. Already in 1950, Alan Turing published an article titled
"Computing Machinery and Intelligence" which proposed what is now called the Turing test as a criterion of intelligence.
"""

# Example usage of different chunking methods
# print("Chunking by words:")
# print(chunk_text_by_words(sample_text, max_words=50))
#
# print("\nChunking by sentences:")
# print(chunk_text_by_sentences(sample_text, max_sentences=2))
#
# print("\nChunking by paragraphs:")
# print(chunk_text_by_paragraphs(sample_text, max_paragraphs=1))
#
# print("\nChunking by tokens:")
# print(chunk_text_by_tokens(sample_text, max_tokens=50))
#
# print("\nHybrid chunking:")
# print(chunk_text_hybrid(sample_text, max_tokens=50))
+
254
+
255
+
256
+ #######################################################################################################################
257
+ #
258
+ # Experimental Semantic Chunking
259
+ #
260
+
261
# Chunk text into segments based on semantic similarity
def count_units(text, unit='tokens'):
    """Return the size of *text* measured in 'words', 'tokens', or 'characters'.

    'tokens' uses nltk's word_tokenize; 'words' is a plain whitespace split.
    Raises ValueError for any other unit.
    """
    if unit == 'words':
        return len(text.split())
    if unit == 'tokens':
        return len(word_tokenize(text))
    if unit == 'characters':
        return len(text)
    raise ValueError("Invalid unit. Choose 'words', 'tokens', or 'characters'.")
+
272
+
273
def semantic_chunking(text, max_chunk_size=2000, unit='words'):
    """Chunk *text* into segments, using TF-IDF cosine similarity between
    adjacent sentences as a topic-boundary hint.

    A chunk is closed either (a) when adding the next sentence would exceed
    max_chunk_size, or (b) when the next sentence is dissimilar
    (similarity < 0.5) and the chunk is already at least half full. In both
    cases the last 3 sentences are carried into the next chunk as overlap,
    so consecutive chunks share up to 3 sentences.

    Parameters:
        text: input text to chunk.
        max_chunk_size: size budget per chunk, measured per *unit*.
        unit: 'words', 'tokens', or 'characters' (see count_units).

    Returns:
        list[str]: chunks of whole sentences joined with spaces.
    """
    nltk.download('punkt', quiet=True)  # ensure the sentence tokenizer model is available
    sentences = sent_tokenize(text)
    vectorizer = TfidfVectorizer()
    sentence_vectors = vectorizer.fit_transform(sentences)

    chunks = []
    current_chunk = []
    current_size = 0

    for i, sentence in enumerate(sentences):
        sentence_size = count_units(sentence, unit)
        # Size-based flush: close the chunk before it overflows the budget.
        if current_size + sentence_size > max_chunk_size and current_chunk:
            chunks.append(' '.join(current_chunk))
            overlap_size = count_units(' '.join(current_chunk[-3:]), unit)  # Use last 3 sentences for overlap
            current_chunk = current_chunk[-3:]  # Keep last 3 sentences for overlap
            current_size = overlap_size

        current_chunk.append(sentence)
        current_size += sentence_size

        # Similarity-based flush: treat a dissimilar next sentence as a topic
        # boundary, but only once the chunk is at least half full.
        # NOTE(review): the 0.5 threshold is a heuristic on raw TF-IDF cosine
        # similarity — tune against real documents before relying on it.
        if i + 1 < len(sentences):
            current_vector = sentence_vectors[i]
            next_vector = sentence_vectors[i + 1]
            similarity = cosine_similarity(current_vector, next_vector)[0][0]
            if similarity < 0.5 and current_size >= max_chunk_size // 2:
                chunks.append(' '.join(current_chunk))
                overlap_size = count_units(' '.join(current_chunk[-3:]), unit)
                current_chunk = current_chunk[-3:]
                current_size = overlap_size

    if current_chunk:
        chunks.append(' '.join(current_chunk))

    return chunks
+
309
+
310
def semantic_chunk_long_file(file_path, max_chunk_size=1000, overlap=100):
    """Read a UTF-8 text file and semantically chunk its contents.

    Parameters:
        file_path: path to the text file.
        max_chunk_size: per-chunk size budget passed to semantic_chunking.
        overlap: retained for interface compatibility but UNUSED —
            semantic_chunking manages its own sentence-based overlap.

    Returns:
        list[str] of chunks, or None if reading/chunking fails.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

        # Bug fix: `overlap` (an int) was previously passed into
        # semantic_chunking's `unit` parameter, which always raised
        # ValueError inside count_units and made this function return None.
        chunks = semantic_chunking(content, max_chunk_size)
        return chunks
    except Exception as e:
        logging.error(f"Error chunking text file: {str(e)}")
        return None
+ #######################################################################################################################
321
+
322
+
323
+
324
+
325
+
326
+
327
+ #######################################################################################################################
328
+ #
329
+ # OpenAI Rolling Summarization
330
+ #
331
+
332
# Module-level OpenAI client; `openai_api_key` is presumably defined/imported
# earlier in this file — TODO confirm it is set before import time.
client = OpenAI(api_key=openai_api_key)
def get_chat_completion(messages, model='gpt-4-turbo'):
    """Return the assistant's reply text for *messages* via the OpenAI Chat API.

    temperature=0 keeps output as deterministic as the API allows.

    Parameters:
        messages: list of {"role": ..., "content": ...} chat messages.
        model: OpenAI chat model name.
    """
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=0,
    )
    return response.choices[0].message.content
+
341
+
342
# This function combines text chunks into larger blocks without exceeding a specified token count.
# It returns the combined chunks, their original indices, and the number of dropped chunks due to overflow.
def combine_chunks_with_no_minimum(
        chunks: List[str],
        max_tokens: int,
        chunk_delimiter="\n\n",
        header: Optional[str] = None,
        add_ellipsis_for_overflow=False,
) -> Tuple[List[str], List[List[int]], int]:
    """Greedily pack *chunks*, in order, into blocks of at most *max_tokens* tokens.

    Parameters:
    - chunks: pieces of text to combine.
    - max_tokens: token budget per combined block (measured with openai_tokenize).
    - chunk_delimiter: string used to join pieces within a block.
    - header: optional text prepended to every combined block.
    - add_ellipsis_for_overflow: when a single chunk alone exceeds the budget,
      emit "..." in its place (if that still fits) instead of dropping it silently.

    Returns:
    - (combined_blocks, indices_per_block, dropped_chunk_count), where
      indices_per_block[i] lists the original chunk indices packed into block i.
      (Bug fix: the annotation previously declared a 2-tuple with List[int],
      but the function has always returned this 3-tuple.)
    """
    dropped_chunk_count = 0
    output = []  # list to hold the final combined chunks
    output_indices = []  # list to hold the indices of the final combined chunks
    candidate = (
        [] if header is None else [header]
    )  # list to hold the current combined chunk candidate
    candidate_indices = []
    for chunk_i, chunk in enumerate(chunks):
        chunk_with_header = [chunk] if header is None else [header, chunk]
        # FIXME MAKE NOT OPENAI SPECIFIC
        if len(openai_tokenize(chunk_delimiter.join(chunk_with_header))) > max_tokens:
            print(f"warning: chunk overflow")
            if (
                    add_ellipsis_for_overflow
                    # FIXME MAKE NOT OPENAI SPECIFIC
                    and len(openai_tokenize(chunk_delimiter.join(candidate + ["..."]))) <= max_tokens
            ):
                candidate.append("...")
                dropped_chunk_count += 1
            continue  # this case would break downstream assumptions
        # estimate token count with the current chunk added
        # FIXME MAKE NOT OPENAI SPECIFIC
        extended_candidate_token_count = len(openai_tokenize(chunk_delimiter.join(candidate + [chunk])))
        # If the token count exceeds max_tokens, add the current candidate to output and start a new candidate
        if extended_candidate_token_count > max_tokens:
            output.append(chunk_delimiter.join(candidate))
            output_indices.append(candidate_indices)
            candidate = chunk_with_header  # re-initialize candidate
            candidate_indices = [chunk_i]
        # otherwise keep extending the candidate
        else:
            candidate.append(chunk)
            candidate_indices.append(chunk_i)
    # add the remaining candidate to output if it's not empty
    if (header is not None and len(candidate) > 1) or (header is None and len(candidate) > 0):
        output.append(chunk_delimiter.join(candidate))
        output_indices.append(candidate_indices)
    return output, output_indices, dropped_chunk_count
+
390
+
391
def rolling_summarize(text: str,
                      detail: float = 0,
                      model: str = 'gpt-4-turbo',
                      additional_instructions: Optional[str] = None,
                      minimum_chunk_size: Optional[int] = 500,
                      chunk_delimiter: str = ".",
                      summarize_recursively=False,
                      verbose=False):
    """
    Summarizes a given text by splitting it into chunks, each of which is summarized individually.
    The level of detail in the summary can be adjusted, and the process can optionally be made recursive.

    Parameters:
    - text (str): The text to be summarized.
    - detail (float, optional): A value between 0 and 1 indicating the desired level of detail in the
      summary. 0 leads to a higher-level summary, and 1 results in a more detailed summary. Defaults to 0.
    - model (str, optional): OpenAI chat model used for each per-chunk summary. Defaults to 'gpt-4-turbo'.
    - additional_instructions (Optional[str], optional): Additional instructions appended to the system
      prompt for customizing summaries.
    - minimum_chunk_size (Optional[int], optional): The minimum size (in tokens) for text chunks.
      Defaults to 500.
    - chunk_delimiter (str, optional): The delimiter used to split the text into chunks. Defaults to ".".
    - summarize_recursively (bool, optional): If True, summaries are generated recursively, using
      previous summaries for context.
    - verbose (bool, optional): If True, prints detailed information about the chunking process.

    Returns:
    - str: The final compiled summary of the text (per-chunk summaries joined by blank lines).

    The function first determines the number of chunks by interpolating between a minimum and a maximum
    chunk count based on the `detail` parameter. It then splits the text into chunks and summarizes each
    chunk. If `summarize_recursively` is True, each summary is based on the previous summaries, adding
    more context to the summarization process. The function returns a compiled summary of all chunks.
    """

    # check detail is set correctly
    assert 0 <= detail <= 1

    # Interpolate the chunk count for the requested level of detail:
    # detail=0 -> 1 chunk (coarsest), detail=1 -> as many chunks as fit at minimum_chunk_size.
    max_chunks = len(chunk_on_delimiter(text, minimum_chunk_size, chunk_delimiter))
    min_chunks = 1
    num_chunks = int(min_chunks + detail * (max_chunks - min_chunks))

    # adjust chunk_size based on interpolated number of chunks
    # FIXME MAKE NOT OPENAI SPECIFIC
    document_length = len(openai_tokenize(text))
    chunk_size = max(minimum_chunk_size, document_length // num_chunks)
    text_chunks = chunk_on_delimiter(text, chunk_size, chunk_delimiter)
    if verbose:
        print(f"Splitting the text into {len(text_chunks)} chunks to be summarized.")
        # FIXME MAKE NOT OPENAI SPECIFIC
        print(f"Chunk lengths are {[len(openai_tokenize(x)) for x in text_chunks]}")

    # set system message - FIXME
    system_message_content = "Rewrite this text in summarized form."
    if additional_instructions is not None:
        system_message_content += f"\n\n{additional_instructions}"

    accumulated_summaries = []
    for i, chunk in enumerate(tqdm(text_chunks)):
        if summarize_recursively and accumulated_summaries:
            # Combine previous summary with current chunk for recursive summarization
            combined_text = accumulated_summaries[-1] + "\n\n" + chunk
            user_message_content = f"Previous summary and new content to summarize:\n\n{combined_text}"
        else:
            user_message_content = chunk

        messages = [
            {"role": "system", "content": system_message_content},
            {"role": "user", "content": user_message_content}
        ]

        # One API call per chunk; failures propagate from get_chat_completion.
        response = get_chat_completion(messages, model=model)
        accumulated_summaries.append(response)

    final_summary = '\n\n'.join(accumulated_summaries)
    return final_summary
+
466
+
467
+
App_Function_Libraries/Diarization_Lib.py ADDED
@@ -0,0 +1,177 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Diarization_Lib.py
2
+ #########################################
3
+ # Diarization Library
4
+ # This library is used to perform diarization of audio files.
5
+ # Currently, uses FIXME for transcription.
6
+ #
7
+ ####################
8
+ ####################
9
+ # Function List
10
+ #
11
+ # 1. speaker_diarize(video_file_path, segments, embedding_model = "pyannote/embedding", embedding_size=512, num_speakers=0)
12
+ #
13
+ ####################
14
+ # Import necessary libraries
15
+ import configparser
16
+ import json
17
+ import logging
18
+ import os
19
+ from pathlib import Path
20
+ import time
21
+ # Import Local
22
+ from App_Function_Libraries.Audio_Transcription_Lib import speech_to_text
23
+ #
24
+ # Import 3rd Party
25
+ from pyannote.audio import Model
26
+ from pyannote.audio.pipelines.speaker_diarization import SpeakerDiarization
27
+ import torch
28
+ import yaml
29
+ #
30
+ #######################################################################################################################
31
+ # Function Definitions
32
+ #
33
+
34
def load_pipeline_from_pretrained(path_to_config: str | Path) -> SpeakerDiarization:
    """Build a pyannote SpeakerDiarization pipeline from a local YAML config.

    Loads the embedding and segmentation models from local file paths named in
    the config (no Hugging Face hub download), forces them onto CPU, then
    applies the remaining pipeline parameters from the config.

    NOTE(review): this temporarily os.chdir()'s into the config's directory so
    relative paths inside the YAML resolve — not safe to call concurrently
    from multiple threads.

    Raises:
        FileNotFoundError: if the config file or either model file is missing.
    """
    path_to_config = Path(path_to_config).resolve()
    print(f"Loading pyannote pipeline from {path_to_config}...")

    if not path_to_config.exists():
        raise FileNotFoundError(f"Config file not found: {path_to_config}")

    # Load the YAML configuration
    with open(path_to_config, 'r') as config_file:
        config = yaml.safe_load(config_file)

    # Store current working directory so it can be restored afterwards
    cwd = Path.cwd().resolve()

    # Change to the directory containing the config file so relative model
    # paths inside the YAML resolve correctly
    cd_to = path_to_config.parent.resolve()
    print(f"Changing working directory to {cd_to}")
    os.chdir(cd_to)

    try:
        # Create a SpeakerDiarization pipeline
        pipeline = SpeakerDiarization()

        # Load models explicitly from local paths
        embedding_path = Path(config['pipeline']['params']['embedding']).resolve()
        segmentation_path = Path(config['pipeline']['params']['segmentation']).resolve()

        if not embedding_path.exists():
            raise FileNotFoundError(f"Embedding model file not found: {embedding_path}")
        if not segmentation_path.exists():
            raise FileNotFoundError(f"Segmentation model file not found: {segmentation_path}")

        # Load the models from local paths using pyannote's Model class
        pipeline.embedding = Model.from_pretrained(str(embedding_path), map_location=torch.device('cpu'))
        pipeline.segmentation = Model.from_pretrained(str(segmentation_path), map_location=torch.device('cpu'))

        # Set other parameters
        pipeline.clustering = config['pipeline']['params']['clustering']
        pipeline.embedding_batch_size = config['pipeline']['params']['embedding_batch_size']
        pipeline.embedding_exclude_overlap = config['pipeline']['params']['embedding_exclude_overlap']
        pipeline.segmentation_batch_size = config['pipeline']['params']['segmentation_batch_size']

        # Set additional parameters
        pipeline.instantiate(config['params'])

    finally:
        # Change back to the original working directory
        print(f"Changing working directory back to {cwd}")
        os.chdir(cwd)

    return pipeline
+
86
def audio_diarization(audio_file_path):
    """Run speaker diarization on *audio_file_path* with a local pyannote pipeline.

    Results are cached next to the audio file as "<name>.diarization.json"
    (compact, stored as {'segments': [...]}) and "<name>.diarization_pretty.json"
    (indented). An existing cache file is reused instead of re-running the model.

    Returns:
        list[dict]: segments with "Time_Start", "Time_End", and "Speaker" keys.

    Raises:
        ValueError: if audio_file_path is None.
        RuntimeError: if diarization fails (original error chained as __cause__).
    """
    logging.info('audio-diarization: Loading pyannote pipeline')
    # NOTE: removed the unused config.txt read (processing_choice) and start-time
    # bookkeeping, plus the needless `global diarization_result`, from the
    # original implementation — none of them affected the result.
    base_dir = Path(__file__).parent.resolve()
    config_path = base_dir / 'models' / 'config.yaml'
    pipeline = load_pipeline_from_pretrained(config_path)

    if audio_file_path is None:
        raise ValueError("audio-diarization: No audio file provided")
    logging.info("audio-diarization: Audio file path: %s", audio_file_path)

    try:
        _, file_ending = os.path.splitext(audio_file_path)
        out_file = audio_file_path.replace(file_ending, ".diarization.json")
        prettified_out_file = audio_file_path.replace(file_ending, ".diarization_pretty.json")
        if os.path.exists(out_file):
            logging.info("audio-diarization: Diarization file already exists: %s", out_file)
            with open(out_file) as f:
                saved_data = json.load(f)
            # Bug fix: the cache stores {'segments': [...]}, but this branch
            # previously returned the raw dict while the fresh path returned a
            # list — callers received inconsistent types depending on caching.
            if isinstance(saved_data, dict):
                return saved_data.get('segments', [])
            return saved_data

        logging.info('audio-diarization: Starting diarization...')
        diarization_result = pipeline(audio_file_path)

        segments = []
        for turn, _, speaker in diarization_result.itertracks(yield_label=True):
            chunk = {
                "Time_Start": turn.start,
                "Time_End": turn.end,
                "Speaker": speaker
            }
            logging.debug("Segment: %s", chunk)
            segments.append(chunk)
        logging.info("audio-diarization: Diarization completed with pyannote")

        output_data = {'segments': segments}

        logging.info("audio-diarization: Saving prettified JSON to %s", prettified_out_file)
        with open(prettified_out_file, 'w') as f:
            json.dump(output_data, f, indent=2)

        logging.info("audio-diarization: Saving JSON to %s", out_file)
        with open(out_file, 'w') as f:
            json.dump(output_data, f)

        return segments

    except Exception as e:
        logging.error("audio-diarization: Error performing diarization: %s", str(e))
        # Chain the original exception for debuggability (previously lost).
        raise RuntimeError("audio-diarization: Error performing diarization") from e
+
141
def combine_transcription_and_diarization(audio_file_path):
    """Merge transcription segments with speaker labels from diarization.

    Runs speech_to_text and audio_diarization on the same file, then assigns a
    speaker to each transcription segment that falls entirely inside a single
    diarization turn. The merged result is written next to the audio file as
    "<name>.combined.json" (compact) and "<name>.combined_pretty.json" (indented).

    NOTE(review): transcription segments that straddle a diarization boundary
    (i.e. not fully contained in any single turn) are silently dropped.
    Assumes speech_to_text returns segments with 'Time_Start'/'Time_End'/'Text'
    keys — confirm against Audio_Transcription_Lib.

    Returns:
        list[dict]: segments with Time_Start, Time_End, Speaker, and Text.
    """
    logging.info('combine-transcription-and-diarization: Starting transcription and diarization...')

    transcription_result = speech_to_text(audio_file_path)

    diarization_result = audio_diarization(audio_file_path)

    combined_result = []
    for transcription_segment in transcription_result:
        for diarization_segment in diarization_result:
            # A transcription segment takes the speaker of the first
            # diarization turn that fully contains it in time.
            if transcription_segment['Time_Start'] >= diarization_segment['Time_Start'] and transcription_segment[
                'Time_End'] <= diarization_segment['Time_End']:
                combined_segment = {
                    "Time_Start": transcription_segment['Time_Start'],
                    "Time_End": transcription_segment['Time_End'],
                    "Speaker": diarization_segment['Speaker'],
                    "Text": transcription_segment['Text']
                }
                combined_result.append(combined_segment)
                break

    _, file_ending = os.path.splitext(audio_file_path)
    out_file = audio_file_path.replace(file_ending, ".combined.json")
    prettified_out_file = audio_file_path.replace(file_ending, ".combined_pretty.json")

    logging.info("combine-transcription-and-diarization: Saving prettified JSON to %s", prettified_out_file)
    with open(prettified_out_file, 'w') as f:
        json.dump(combined_result, f, indent=2)

    logging.info("combine-transcription-and-diarization: Saving JSON to %s", out_file)
    with open(out_file, 'w') as f:
        json.dump(combined_result, f)

    return combined_result
+ #
176
+ #
177
+ #######################################################################################################################
App_Function_Libraries/Gradio_Related.py ADDED
The diff for this file is too large to render. See raw diff
 
App_Function_Libraries/LLM_API_Calls.py ADDED
@@ -0,0 +1,633 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Summarization_General_Lib.py
2
+ #########################################
3
+ # General Summarization Library
4
+ # This library is used to perform summarization.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. chat_with_openai(api_key, file_path, custom_prompt_arg)
12
+ # 3. chat_with_anthropic(api_key, file_path, model, custom_prompt_arg, max_retries=3, retry_delay=5)
13
+ # 4. chat_with_cohere(api_key, file_path, model, custom_prompt_arg)
14
+ # 5. chat_with_groq(api_key, input_data, custom_prompt_arg, system_prompt=None):
15
+ # 6. chat_with_openrouter(api_key, input_data, custom_prompt_arg, system_prompt=None)
16
+ # 7. chat_with_huggingface(api_key, input_data, custom_prompt_arg, system_prompt=None)
17
+ # 8. chat_with_deepseek(api_key, input_data, custom_prompt_arg, system_prompt=None)
18
+ # 9. chat_with_vllm(input_data, custom_prompt_input, api_key=None, vllm_api_url="http://127.0.0.1:8000/v1/chat/completions", system_prompt=None)
19
+ #
20
+ #
21
+ ####################
22
+ import json
23
+ # Import necessary libraries
24
+ import os
25
+ import logging
26
+ import time
27
+ import requests
28
+ import configparser
29
+ # Import 3rd-Party Libraries
30
+ from openai import OpenAI
31
+ from requests import RequestException
32
+ # Import Local libraries
33
+ from App_Function_Libraries.Local_Summarization_Lib import openai_api_key, client
34
+ from App_Function_Libraries.Utils import load_and_log_configs
35
+ #
36
+ #######################################################################################################################
37
+ # Function Definitions
38
+ #
39
+
40
def extract_text_from_segments(segments):
    """Concatenate the 'Text' field of each segment dict into one string.

    Segments missing a 'Text' key are skipped with a warning; a non-list
    input yields an empty string. Pieces are joined with single spaces and
    the result is stripped.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    pieces = []

    if isinstance(segments, list):
        for segment in segments:
            logging.debug(f"Current segment: {segment}")
            logging.debug(f"Type of segment: {type(segment)}")
            if 'Text' in segment:
                pieces.append(segment['Text'])
            else:
                logging.warning(f"Skipping segment due to missing 'Text' key: {segment}")
    else:
        logging.warning(f"Unexpected type of 'segments': {type(segments)}")

    return ' '.join(pieces).strip()
+
59
+
60
+
61
+
62
+
63
def chat_with_openai(api_key, input_data, custom_prompt_arg, system_prompt=None):
    """Send *input_data* followed by *custom_prompt_arg* to the OpenAI Chat API.

    Parameters:
        api_key: OpenAI API key; falls back to the config file when empty.
        input_data: text/data placed at the start of the user message.
        custom_prompt_arg: prompt appended after the input data.
        system_prompt: optional system prompt; defaults to a generic assistant.

    Returns:
        The assistant's reply text on success, or an error-description string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation
        if api_key is None or api_key.strip() == "":
            logging.info("OpenAI: API key not provided as parameter")
            logging.info("OpenAI: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['openai']

        if api_key is None or api_key.strip() == "":
            logging.error("OpenAI: API key not found or is empty")
            return "OpenAI: API Key Not Provided/Found in Config file or is empty"

        # Bug fix: a later debug line sliced the module-level `openai_api_key`
        # (which may be None, raising TypeError) instead of the resolved key.
        logging.debug(f"OpenAI: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        logging.debug("OpenAI: Using provided string data for chat input")
        data = input_data

        logging.debug(f"OpenAI: Loaded data: {data}")
        logging.debug(f"OpenAI: Type of data: {type(data)}")

        if system_prompt is not None:
            logging.debug(f"OpenAI: Using provided system prompt:\n\n {system_prompt}")
        else:
            system_prompt = "You are a helpful assistant"
            logging.debug(f"OpenAI: Using default system prompt:\n\n {system_prompt}")

        openai_model = loaded_config_data['models']['openai'] or "gpt-4o"

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("openai: Preparing data + prompt for submittal")
        openai_prompt = f"{data} \n\n\n\n{custom_prompt_arg}"
        payload = {
            "model": openai_model,
            "messages": [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": openai_prompt}
            ],
            "max_tokens": 4096,
            "temperature": 0.1
        }

        logging.debug("openai: Posting request")
        response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=payload)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                chat_response = response_data['choices'][0]['message']['content'].strip()
                logging.debug("openai: Chat Sent successfully")
                return chat_response
            else:
                logging.warning("openai: Chat response not found in the response data")
                return "openai: Chat not available"
        else:
            logging.error(f"openai: Chat request failed with status code {response.status_code}")
            logging.error(f"openai: Error response: {response.text}")
            return f"openai: Failed to process chat request. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"openai: Error in processing: {str(e)}", exc_info=True)
        return f"openai: Error occurred while processing chat request: {str(e)}"
+
132
+
133
def chat_with_anthropic(api_key, input_data, model, custom_prompt_arg, max_retries=3, retry_delay=5, system_prompt=None):
    """Send a chat request to the Anthropic Messages API, retrying on 500 errors.

    Parameters:
        api_key: Anthropic API key; falls back to the config file when empty.
        input_data: text/data placed at the start of the user message.
        model: Anthropic model name used for the request.
        custom_prompt_arg: prompt appended after the input data.
        max_retries / retry_delay: retry policy for transient server errors.
        system_prompt: optional system prompt; defaults to a generic assistant.

    Returns:
        The response text on success, None on unrecoverable API errors (or when
        all retries are exhausted), or an error-description string on
        network/processing failures.
    """
    try:
        loaded_config_data = load_and_log_configs()
        # API key validation.
        # Bug fix: the config key was previously stored in a `global
        # anthropic_api_key` that stayed unset when a key was passed as a
        # parameter, so building the headers below raised NameError.
        if api_key is None or api_key.strip() == "":
            logging.info("Anthropic: API key not provided as parameter")
            logging.info("Anthropic: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['anthropic']

        if api_key is None or api_key.strip() == "":
            logging.error("Anthropic: API key not found or is empty")
            return "Anthropic: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"Anthropic: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        if system_prompt is not None:
            logging.debug("Anthropic: Using provided system prompt")
        else:
            system_prompt = "You are a helpful assistant"

        logging.debug(f"AnthropicAI: Loaded data: {input_data}")
        logging.debug(f"AnthropicAI: Type of data: {type(input_data)}")

        # NOTE(review): read but unused — the request below uses the `model`
        # parameter, not the configured default.
        anthropic_model = loaded_config_data['models']['anthropic']

        headers = {
            'x-api-key': api_key,
            'anthropic-version': '2023-06-01',
            'Content-Type': 'application/json'
        }

        anthropic_user_prompt = custom_prompt_arg
        logging.debug(f"Anthropic: User Prompt is {anthropic_user_prompt}")
        user_message = {
            "role": "user",
            "content": f"{input_data} \n\n\n\n{anthropic_user_prompt}"
        }

        data = {
            "model": model,
            "max_tokens": 4096,  # max _possible_ tokens to return
            "messages": [user_message],
            "stop_sequences": ["\n\nHuman:"],
            "temperature": 0.1,
            "top_k": 0,
            "top_p": 1.0,
            "metadata": {
                "user_id": "example_user_id",
            },
            "stream": False,
            "system": f"{system_prompt}"
        }

        for attempt in range(max_retries):
            try:
                logging.debug("anthropic: Posting request to API")
                response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data)

                # Check if the status code indicates success
                if response.status_code == 200:
                    logging.debug("anthropic: Post submittal successful")
                    response_data = response.json()
                    try:
                        chat_response = response_data['content'][0]['text'].strip()
                        logging.debug("anthropic: Chat request successful")
                        print("Chat request processed successfully.")
                        return chat_response
                    except (IndexError, KeyError) as e:
                        logging.debug("anthropic: Unexpected data in response")
                        print("Unexpected response format from Anthropic API:", response.text)
                        return None
                elif response.status_code == 500:  # Handle internal server error specifically
                    logging.debug("anthropic: Internal server error")
                    print("Internal server error from API. Retrying may be necessary.")
                    time.sleep(retry_delay)
                else:
                    logging.debug(
                        f"anthropic: Failed to process chat request, status code {response.status_code}: {response.text}")
                    print(f"Failed to process chat request, status code {response.status_code}: {response.text}")
                    return None

            except RequestException as e:
                logging.error(f"anthropic: Network error during attempt {attempt + 1}/{max_retries}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                else:
                    return f"anthropic: Network error: {str(e)}"

        # All retries exhausted on 500 responses; make the implicit None explicit.
        return None
    except Exception as e:
        logging.error(f"anthropic: Error in processing: {str(e)}")
        return f"anthropic: Error occurred while processing summary with Anthropic: {str(e)}"
+
226
+
227
# Summarize with Cohere
def chat_with_cohere(api_key, input_data, model, custom_prompt_arg, system_prompt=None):
    """Send a chat request to the Cohere /v1/chat API (with web-search connector).

    Parameters:
        api_key: Cohere API key; falls back to the config file when empty.
        input_data: text/data placed at the start of the message.
        model: Cohere model name used for the request.
        custom_prompt_arg: prompt appended after the input data.
        system_prompt: optional system message; defaults to a generic assistant.

    Returns:
        The response text on success, or an error-description string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation.
        # Bug fix: the config key was previously stored in a `global
        # cohere_api_key` that stayed unset when a key was passed as a
        # parameter, so the Authorization header below raised NameError.
        if api_key is None or api_key.strip() == "":
            logging.info("cohere: API key not provided as parameter")
            logging.info("cohere: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['cohere']

        if api_key is None or api_key.strip() == "":
            logging.error("cohere: API key not found or is empty")
            return "cohere: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"cohere: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        logging.debug(f"Cohere: Loaded data: {input_data}")
        logging.debug(f"Cohere: Type of data: {type(input_data)}")

        # NOTE(review): read but unused — the request below uses the `model`
        # parameter, not the configured default.
        cohere_model = loaded_config_data['models']['cohere']

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
            'Authorization': f'Bearer {api_key}'
        }

        if system_prompt is not None:
            # Bug fix: this log line previously said "Anthropic".
            logging.debug("Cohere: Using provided system prompt")
        else:
            system_prompt = "You are a helpful assistant"

        cohere_prompt = f"{input_data} \n\n\n\n{custom_prompt_arg}"
        logging.debug(f"cohere: User Prompt being sent is {cohere_prompt}")

        logging.debug(f"cohere: System Prompt being sent is {system_prompt}")

        data = {
            "chat_history": [
                # Bug fix: previously sent the literal string "system_prompt"
                # (f"system_prompt" had no braces) instead of its value.
                {"role": "SYSTEM", "message": f"{system_prompt}"},
            ],
            "message": f"{cohere_prompt}",
            "model": model,
            "connectors": [{"id": "web-search"}]
        }

        logging.debug("cohere: Submitting request to API endpoint")
        print("cohere: Submitting request to API endpoint")
        response = requests.post('https://api.cohere.ai/v1/chat', headers=headers, json=data)
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'text' in response_data:
                chat_response = response_data['text'].strip()
                logging.debug("cohere: Chat request successful")
                print("Chat request processed successfully.")
                return chat_response
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
            print(f"Failed to process summary, status code {response.status_code}: {response.text}")
            return f"cohere: API request failed: {response.text}"

    except Exception as e:
        logging.error("cohere: Error in processing: %s", str(e))
        return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"
+
299
+
300
+ # https://console.groq.com/docs/quickstart
301
def chat_with_groq(api_key, input_data, custom_prompt_arg, system_prompt=None):
    """Send a chat request to Groq's OpenAI-compatible chat-completions API.

    Args:
        api_key: Groq API key; falls back to the config file when None/empty.
        input_data: Text content (e.g. a transcript) to chat about.
        custom_prompt_arg: User instructions appended after the input data.
        system_prompt: Optional system message; defaults to a generic assistant.

    Returns:
        The model's reply text on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: fall back to the config file when not supplied.
        if api_key is None or api_key.strip() == "":
            logging.info("groq: API key not provided as parameter")
            logging.info("groq: Attempting to use API key from config file")
            # BUG FIX: the config value was previously stored in an unused
            # local (groq_api_key) and never actually used for the request.
            api_key = loaded_config_data['api_keys']['groq']

        if api_key is None or api_key.strip() == "":
            logging.error("groq: API key not found or is empty")
            return "groq: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"groq: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        logging.debug(f"Groq: Loaded data: {input_data}")
        logging.debug(f"Groq: Type of data: {type(input_data)}")

        # Set the model to be used
        groq_model = loaded_config_data['models']['groq']

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        if system_prompt is None:
            system_prompt = "You are a helpful assistant"
        else:
            logging.debug("Groq: Using provided system prompt")

        groq_prompt = f"{input_data} \n\n\n\n{custom_prompt_arg}"
        # BUG FIX: these two debug lines were missing the f-string prefix and
        # logged the literal placeholder text instead of the values.
        logging.debug(f"groq: User Prompt being sent is {groq_prompt}")
        logging.debug(f"groq: System Prompt being sent is {system_prompt}")

        data = {
            "messages": [
                {
                    "role": "system",
                    "content": f"{system_prompt}"
                },
                {
                    "role": "user",
                    "content": groq_prompt
                }
            ],
            "model": groq_model
        }

        logging.debug("groq: Submitting request to API endpoint")
        print("groq: Submitting request to API endpoint")
        response = requests.post('https://api.groq.com/openai/v1/chat/completions', headers=headers, json=data)

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("groq: Summarization successful")
                print("Summarization successful.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"groq: API request failed with status code {response.status_code}: {response.text}")
            return f"groq: API request failed: {response.text}"

    except Exception as e:
        logging.error("groq: Error in processing: %s", str(e))
        return f"groq: Error occurred while processing summary with groq: {str(e)}"
375
+
376
+
377
def chat_with_openrouter(api_key, input_data, custom_prompt_arg, system_prompt=None):
    """Send a chat request to the OpenRouter chat-completions API.

    Args:
        api_key: OpenRouter API key; falls back to the config file when None/empty.
        input_data: Text content to chat about.
        custom_prompt_arg: User instructions appended after the input data.
        system_prompt: Optional system message; defaults to a generic assistant.

    Returns:
        The model's reply text on success, otherwise an error string.
    """
    import requests
    import json
    loaded_config_data = load_and_log_configs()

    # API key validation: fall back to the config file when not supplied.
    if api_key is None or api_key.strip() == "":
        logging.info("openrouter: API key not provided as parameter")
        logging.info("openrouter: Attempting to use API key from config file")
        # BUG FIX: the key was previously stored only in the global
        # `openrouter_api_key`, which was undefined at request time whenever
        # a caller passed api_key directly; use the parameter consistently.
        api_key = loaded_config_data['api_keys']['openrouter']

    if api_key is None or api_key.strip() == "":
        logging.error("openrouter: API key not found or is empty")
        return "openrouter: API Key Not Provided/Found in Config file or is empty"

    logging.debug(f"openrouter: Using API Key: {api_key[:5]}...{api_key[-5:]}")

    logging.debug(f"openrouter: Loaded data: {input_data}")
    logging.debug(f"openrouter: Type of data: {type(input_data)}")

    # NOTE: a redundant config.txt re-parse was removed here; the model comes
    # from the already-loaded config, matching the original behavior.
    openrouter_model = loaded_config_data['models']['openrouter']

    if system_prompt is None:
        system_prompt = "You are a helpful assistant"
    else:
        logging.debug("OpenRouter: Using provided system prompt")

    openrouter_prompt = f"{input_data} \n\n\n\n{custom_prompt_arg}"
    logging.debug(f"openrouter: User Prompt being sent is {openrouter_prompt}")
    logging.debug(f"openrouter: System Prompt being sent is {system_prompt}")

    try:
        logging.debug("openrouter: Submitting request to API endpoint")
        print("openrouter: Submitting request to API endpoint")
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
            },
            data=json.dumps({
                "model": f"{openrouter_model}",
                "messages": [
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": openrouter_prompt}
                ]
            })
        )

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("openrouter: Chat request successful")
                print("openrouter: Chat request successful.")
                return summary
            else:
                logging.error("openrouter: Expected data not found in API response.")
                return "openrouter: Expected data not found in API response."
        else:
            logging.error(f"openrouter: API request failed with status code {response.status_code}: {response.text}")
            return f"openrouter: API request failed: {response.text}"
    except Exception as e:
        logging.error("openrouter: Error in processing: %s", str(e))
        return f"openrouter: Error occurred while processing chat request with openrouter: {str(e)}"
455
+
456
+ # FIXME: This function is not yet implemented properly
457
def chat_with_huggingface(api_key, input_data, custom_prompt_arg, system_prompt=None):
    """Send text to a HuggingFace Inference API model and return its output.

    Args:
        api_key: HuggingFace API token; falls back to the config file when None/empty.
        input_data: Text content to process.
        custom_prompt_arg: Instructions appended after the input data.
        system_prompt: Optional system message; defaults to a generic assistant.
            NOTE(review): the inference endpoint payload has no system-message
            field, so this value is currently unused — confirm intended design.

    Returns:
        The model's summary text on success, an error string on HTTP failure,
        or None when an exception occurs.
    """
    loaded_config_data = load_and_log_configs()
    logging.debug("huggingface: Summarization process starting...")
    try:
        # API key validation: fall back to the config file when not supplied.
        if api_key is None or api_key.strip() == "":
            logging.info("HuggingFace: API key not provided as parameter")
            logging.info("HuggingFace: Attempting to use API key from config file")
            # BUG FIX: this previously read the *openai* key from the config
            # and stored it in an unused global instead of using it here.
            api_key = loaded_config_data['api_keys']['huggingface']
        if api_key is None or api_key.strip() == "":
            logging.error("HuggingFace: API key not found or is empty")
            return "HuggingFace: API Key Not Provided/Found in Config file or is empty"
        logging.debug(f"HuggingFace: Using API Key: {api_key[:5]}...{api_key[-5:]}")
        headers = {
            "Authorization": f"Bearer {api_key}"
        }

        # Setup model
        huggingface_model = loaded_config_data['models']['huggingface']

        API_URL = f"https://api-inference.huggingface.co/models/{huggingface_model}"
        if system_prompt is None:
            system_prompt = "You are a helpful assistant"
        else:
            logging.debug("HuggingFace: Using provided system prompt")

        huggingface_prompt = f"{input_data}\n\n\n\n{custom_prompt_arg}"
        # BUG FIX: missing f-string prefix, and the combined prompt built above
        # was never sent (only the raw input_data was posted).
        logging.debug(f"huggingface: Prompt being sent is {huggingface_prompt}")
        data = {
            "inputs": f"{huggingface_prompt}",
            "parameters": {"max_length": 8192, "min_length": 100}  # You can adjust max_length and min_length as needed
        }
        logging.debug("huggingface: Submitting request...")

        response = requests.post(API_URL, headers=headers, json=data)

        if response.status_code == 200:
            summary = response.json()[0]['summary_text']
            logging.debug("huggingface: Chat request successful")
            print("Chat request successful.")
            return summary
        else:
            logging.error(f"huggingface: Chat request failed with status code {response.status_code}: {response.text}")
            return f"Failed to process chat request, status code {response.status_code}: {response.text}"
    except Exception as e:
        logging.error("huggingface: Error in processing: %s", str(e))
        print(f"Error occurred while processing chat request with huggingface: {str(e)}")
        return None
507
+
508
+
509
def chat_with_deepseek(api_key, input_data, custom_prompt_arg, system_prompt=None):
    """Send a chat request to the DeepSeek chat-completions API.

    Args:
        api_key: DeepSeek API key; falls back to the config file when None/empty.
        input_data: Text content to chat about.
        custom_prompt_arg: User instructions appended after the input data.
        system_prompt: Optional system message; defaults to a generic assistant.

    Returns:
        The model's reply text on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: fall back to the config file when not supplied.
        if api_key is None or api_key.strip() == "":
            logging.info("DeepSeek: API key not provided as parameter")
            logging.info("DeepSeek: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['deepseek']

        if api_key is None or api_key.strip() == "":
            logging.error("DeepSeek: API key not found or is empty")
            return "DeepSeek: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"DeepSeek: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # Fall back to the default model when the config entry is empty.
        deepseek_model = loaded_config_data['models']['deepseek'] or "deepseek-chat"

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        if system_prompt is None:
            system_prompt = "You are a helpful assistant"
        else:
            logging.debug(f"Deepseek: Using provided system prompt: {system_prompt}")

        # BUG FIX: this debug line was mislabeled "openai:"; also removed a
        # duplicate key-prefix log emitted a few lines earlier.
        logging.debug("deepseek: Preparing data + prompt for submittal")
        deepseek_prompt = f"{input_data} \n\n\n\n{custom_prompt_arg}"
        data = {
            "model": deepseek_model,
            "messages": [
                {"role": "system", "content": f"{system_prompt}"},
                {"role": "user", "content": deepseek_prompt}
            ],
            "stream": False,
            "temperature": 0.8
        }

        logging.debug("DeepSeek: Posting request")
        response = requests.post('https://api.deepseek.com/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("DeepSeek: Chat request successful")
                return summary
            else:
                logging.warning("DeepSeek: Chat response not found in the response data")
                return "DeepSeek: Chat response not available"
        else:
            logging.error(f"DeepSeek: Chat request failed with status code {response.status_code}")
            logging.error(f"DeepSeek: Error response: {response.text}")
            return f"DeepSeek: Failed to chat request summary. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"DeepSeek: Error in processing: {str(e)}", exc_info=True)
        return f"DeepSeek: Error occurred while processing chat request: {str(e)}"
570
+
571
+
572
+
573
+ # Stashed in here since OpenAI usage.... #FIXME
574
+ # FIXME - https://docs.vllm.ai/en/latest/getting_started/quickstart.html .... Great docs.
575
def chat_with_vllm(input_data, custom_prompt_input, api_key=None, vllm_api_url="http://127.0.0.1:8000/v1/chat/completions", system_prompt=None):
    """Chat/summarize over input_data via a vLLM OpenAI-compatible server.

    Args:
        input_data: A JSON file path, a raw string, a dict (may already contain
            a 'summary'), or a list of transcript segment dicts.
        custom_prompt_input: User instructions appended after the text.
        api_key: Optional key; falls back to the config 'llama' key.
        vllm_api_url: Base URL of the vLLM OpenAI-compatible endpoint.
        system_prompt: Optional system message; sent as-is (may be None).

    Returns:
        The generated summary text, or the pre-existing summary if input_data
        already contains one.

    Raises:
        ValueError: If the resolved data is neither a list, a string, nor a
            dict with a 'summary' key.
    """
    loaded_config_data = load_and_log_configs()
    llm_model = loaded_config_data['models']['vllm']
    # API key validation: fall back to the config file when not supplied.
    if api_key is None:
        logging.info("vLLM: API key not provided as parameter")
        logging.info("vLLM: Attempting to use API key from config file")
        api_key = loaded_config_data['api_keys']['llama']

    if api_key is None or api_key.strip() == "":
        logging.info("vLLM: API key not found or is empty")

    # BUG FIX: the client was previously constructed with the *custom prompt*
    # as the API key, and then the completion call referenced an undefined
    # name (`client`) instead of the constructed `vllm_client`.
    vllm_client = OpenAI(
        base_url=vllm_api_url,
        api_key=api_key
    )

    # Accept either a path to a JSON file or in-memory data.
    if isinstance(input_data, str) and os.path.isfile(input_data):
        logging.debug("vLLM: Loading json data for summarization")
        with open(input_data, 'r') as file:
            data = json.load(file)
    else:
        logging.debug("vLLM: Using provided string data for summarization")
        data = input_data

    logging.debug(f"vLLM: Loaded data: {data}")
    logging.debug(f"vLLM: Type of data: {type(data)}")

    if isinstance(data, dict) and 'summary' in data:
        # If the loaded data is a dictionary and already contains a summary, return it
        logging.debug("vLLM: Summary already exists in the loaded data")
        return data['summary']

    # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
    if isinstance(data, list):
        text = extract_text_from_segments(data)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("Invalid input data format")

    custom_prompt = custom_prompt_input

    completion = vllm_client.chat.completions.create(
        model=llm_model,
        messages=[
            {"role": "system", "content": f"{system_prompt}"},
            {"role": "user", "content": f"{text} \n\n\n\n{custom_prompt}"}
        ]
    )
    vllm_summary = completion.choices[0].message.content
    return vllm_summary
628
+
629
+
630
+
631
+ #
632
+ #
633
+ #######################################################################################################################
App_Function_Libraries/LLM_API_Calls_Local.py ADDED
@@ -0,0 +1,348 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local_Summarization_Lib.py
2
+ #########################################
3
+ # Local Summarization Library
4
+ # This library is used to perform summarization with a 'local' inference engine.
5
+ #
6
+ ####
7
+
8
+ ####################
9
+ # Function List
10
+ # FIXME - UPDATE Function Arguments
11
+ # 1. chat_with_local_llm(text, custom_prompt_arg)
12
+ # 2. chat_with_llama(api_url, text, token, custom_prompt)
13
+ # 3. chat_with_kobold(api_url, text, kobold_api_token, custom_prompt)
14
+ # 4. chat_with_oobabooga(api_url, text, ooba_api_token, custom_prompt)
15
+ # 5. chat_with_vllm(vllm_api_url, vllm_api_key_function_arg, llm_model, text, vllm_custom_prompt_function_arg)
16
+ # 6. chat_with_tabbyapi(tabby_api_key, tabby_api_IP, text, tabby_model, custom_prompt)
17
+ # 7. save_summary_to_file(summary, file_path)
18
+ #
19
+ #
20
+ ####################
21
+ # Import necessary libraries
22
+ import json
23
+ # Import Local
24
+ from Utils import *
25
+ #
26
+ #######################################################################################################################
27
+ # Function Definitions
28
+ #
29
+
30
+
31
def chat_with_local_llm(input_data, user_prompt, system_prompt=None):
    """Send a chat request to a local llama.cpp-style server on port 8080.

    Args:
        input_data: Text content to chat about.
        user_prompt: Instructions placed before the input data.
        system_prompt: Optional system message; defaults to a generic assistant.

    Returns:
        The model's reply text on success, otherwise an error string.
    """
    try:
        if system_prompt is None:
            # BUG FIX: the default was previously assigned to an unused local
            # (`system_prompt_arg`), so the request carried the literal string
            # "None" as its system message.
            system_prompt = "You are a helpful assistant."

        headers = {
            'Content-Type': 'application/json'
        }

        logging.debug("Local LLM: Preparing data + prompt for submittal")
        local_llm_prompt = f"{user_prompt}\n\n\n\n{input_data} "
        data = {
            "messages": [
                {
                    "role": "system",
                    "content": f"{system_prompt}"
                },
                {
                    "role": "user",
                    "content": f"{local_llm_prompt}"
                }
            ],
            "max_tokens": 28000,  # Adjust tokens as needed
        }
        logging.debug("Local LLM: System Prompt to be used: %s", system_prompt)
        logging.debug("Local LLM: User Prompt to be used: %s", user_prompt)
        logging.debug("Local LLM: Posting request")
        response = requests.post('http://127.0.0.1:8080/v1/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("Local LLM: Chat request successful")
                print("Local LLM: Chat request successful.")
                return summary
            else:
                logging.warning("Local LLM: Chat response not found in the response data")
                return "Local LLM: Chat response not available"
        else:
            logging.debug("Local LLM: Chat request failed")
            print("Local LLM: Failed to process Chat response:", response.text)
            return "Local LLM: Failed to process Chat response"
    except Exception as e:
        logging.debug("Local LLM: Error in processing: %s", str(e))
        print("Error occurred while processing Chat request with Local LLM:", str(e))
        return "Local LLM: Error occurred while processing Chat response"
78
+
79
def chat_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:8080/completion", api_key=None, system_prompt=None):
    """Send a completion request to a llama.cpp server.

    Args:
        input_data: Text content to process.
        custom_prompt: Instructions placed before the input data.
        api_url: URL of the llama.cpp /completion endpoint.
        api_key: Optional key; a local server usually needs none.
        system_prompt: Optional system message; defaults to a concise-assistant prompt.

    Returns:
        The generated text on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: fall back to the config file when not supplied.
        if api_key is None:
            logging.info("llama.cpp: API key not provided as parameter")
            logging.info("llama.cpp: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['llama']

        if api_key is None or api_key.strip() == "":
            logging.info("llama.cpp: API key not found or is empty")
        else:
            # BUG FIX: the key prefix was previously logged unconditionally,
            # raising TypeError when no key was configured (api_key is None).
            logging.debug(f"llama.cpp: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }
        # Only attach a bearer token when a plausible key is present; the
        # None-safe check fixes a crash from `len(None)` in the original.
        if api_key and len(api_key) > 5:
            headers['Authorization'] = f'Bearer {api_key}'

        if system_prompt is None:
            system_prompt = "You are a helpful AI assistant that provides accurate and concise information."

        logging.debug("Llama.cpp: System prompt being used is: %s", system_prompt)
        logging.debug("Llama.cpp: User prompt being used is: %s", custom_prompt)

        llama_prompt = f"{custom_prompt} \n\n\n\n{input_data}"
        logging.debug(f"llama: Prompt being sent is {llama_prompt}")

        data = {
            "prompt": f"{llama_prompt}",
            "system_prompt": f"{system_prompt}"
        }

        logging.debug("llama: Submitting request to API endpoint")
        print("llama: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=data)
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            logging.debug(response_data)
            summary = response_data['content'].strip()
            logging.debug("llama: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"Llama: API request failed with status code {response.status_code}: {response.text}")
            return f"Llama: API request failed: {response.text}"

    except Exception as e:
        logging.error("Llama: Error in processing: %s", str(e))
        return f"Llama: Error occurred while processing summary with llama: {str(e)}"
135
+
136
+
137
+ # System prompts not supported through API requests.
138
+ # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
139
def chat_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_IP="http://127.0.0.1:5001/api/v1/generate"):
    """Send a generate request to a local Kobold.cpp server.

    Note: the Kobold /api/v1/generate endpoint does not support a separate
    system prompt, so only the combined user prompt + input is sent.

    Args:
        input_data: Text content to process.
        api_key: Optional key; falls back to the config file. A local server
            usually needs none.
        custom_prompt_input: Instructions placed before the input data.
        kobold_api_IP: URL of the Kobold generate endpoint.

    Returns:
        The generated text on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: fall back to the config file when not supplied.
        if api_key is None:
            logging.info("Kobold.cpp: API key not provided as parameter")
            logging.info("Kobold.cpp: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['kobold']

        if api_key is None or api_key.strip() == "":
            logging.info("Kobold.cpp: API key not found or is empty")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        kobold_prompt = f"{custom_prompt_input} \n\n\n\n{input_data}"
        # BUG FIX: this debug line was missing the f-string prefix.
        logging.debug(f"kobold: Prompt being sent is {kobold_prompt}")

        # FIXME
        # Values literally c/p from the api docs....
        data = {
            "max_context_length": 8096,
            "max_length": 4096,
            "prompt": f"{custom_prompt_input}\n\n\n\n{input_data}"
        }

        logging.debug("kobold: Submitting request to API endpoint")
        print("kobold: Submitting request to API endpoint")
        response = requests.post(kobold_api_IP, headers=headers, json=data)
        response_data = response.json()
        logging.debug("kobold: API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'results' in response_data and len(response_data['results']) > 0:
                summary = response_data['results'][0]['text'].strip()
                logging.debug("kobold: Chat request successful!")
                print("Chat request successful!")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"kobold: API request failed with status code {response.status_code}: {response.text}")
            return f"kobold: API request failed: {response.text}"

    except Exception as e:
        logging.error("kobold: Error in processing: %s", str(e))
        return f"kobold: Error occurred while processing chat response with kobold: {str(e)}"
189
+
190
+ # System prompt doesn't work. FIXME
191
+ # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
192
def chat_with_oobabooga(input_data, api_key, custom_prompt, api_url="http://127.0.0.1:5000/v1/chat/completions", system_prompt=None):
    """Send a chat request to a local oobabooga text-generation-webui server.

    Note: per the upstream API docs, the system prompt is not honored through
    this endpoint, so `system_prompt` is resolved but not forwarded.

    Args:
        input_data: Text content to process.
        api_key: Optional key; falls back to the config file.
        custom_prompt: Instructions appended after the input data.
        api_url: URL of the OpenAI-compatible chat-completions endpoint.
        system_prompt: Optional system message (currently unused by the API).

    Returns:
        The generated text on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: fall back to the config file when not supplied.
        if api_key is None:
            logging.info("ooba: API key not provided as parameter")
            logging.info("ooba: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['ooba']

        if api_key is None or api_key.strip() == "":
            logging.info("ooba: API key not found or is empty")

        if system_prompt is None:
            system_prompt = "You are a helpful AI assistant that provides accurate and concise information."

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        ooba_prompt = f"{input_data}" + f"\n\n\n\n{custom_prompt}"
        # BUG FIX: this debug line was missing the f-string prefix.
        logging.debug(f"ooba: Prompt being sent is {ooba_prompt}")

        data = {
            "mode": "chat",
            "character": "Example",
            "messages": [{"role": "user", "content": ooba_prompt}]
        }

        logging.debug("ooba: Submitting request to API endpoint")
        print("ooba: Submitting request to API endpoint")
        # verify=False: local server typically runs without a trusted TLS cert.
        response = requests.post(api_url, headers=headers, json=data, verify=False)
        logging.debug("ooba: API Response Data: %s", response)

        if response.status_code == 200:
            # BUG FIX: the body was previously parsed twice (response.json()
            # called once into an unused variable and once inline).
            response_data = response.json()
            summary = response_data['choices'][0]['message']['content']
            logging.debug("ooba: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"oobabooga: API request failed with status code {response.status_code}: {response.text}")
            return f"ooba: API request failed with status code {response.status_code}: {response.text}"

    except Exception as e:
        logging.error("ooba: Error in processing: %s", str(e))
        return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
242
+
243
+
244
+ # FIXME - Install is more trouble than care to deal with right now.
245
def chat_with_tabbyapi(input_data, custom_prompt_input, api_key=None, api_IP="http://127.0.0.1:5000/v1/chat/completions"):
    """Summarize/chat over input_data via a local TabbyAPI server.

    Args:
        input_data: A JSON file path, a raw string, a dict (may already contain
            a 'summary'), or a list of transcript segment dicts.
        custom_prompt_input: Instructions for the model.
            NOTE(review): currently not included in the request payload —
            confirm whether the endpoint should receive it.
        api_key: Optional key; falls back to the config file.
        api_IP: Default endpoint URL.
            NOTE(review): the request actually targets the config value
            `local_apis.tabby.ip`, matching the original behavior — confirm
            which should win.

    Returns:
        The summary text, or an error string on request failure.

    Raises:
        ValueError: If the resolved data is neither a list, a string, nor a
            dict with a 'summary' key.
    """
    loaded_config_data = load_and_log_configs()
    model = loaded_config_data['models']['tabby']
    # API key validation: fall back to the config file when not supplied.
    if api_key is None:
        logging.info("tabby: API key not provided as parameter")
        logging.info("tabby: Attempting to use API key from config file")
        api_key = loaded_config_data['api_keys']['tabby']

    if api_key is None or api_key.strip() == "":
        logging.info("tabby: API key not found or is empty")

    # Accept either a path to a JSON file or in-memory data.
    if isinstance(input_data, str) and os.path.isfile(input_data):
        logging.debug("tabby: Loading json data for summarization")
        with open(input_data, 'r') as file:
            data = json.load(file)
    else:
        logging.debug("tabby: Using provided string data for summarization")
        data = input_data

    logging.debug(f"tabby: Loaded data: {data}")
    logging.debug(f"tabby: Type of data: {type(data)}")

    if isinstance(data, dict) and 'summary' in data:
        # If the loaded data is a dictionary and already contains a summary, return it
        logging.debug("tabby: Summary already exists in the loaded data")
        return data['summary']

    # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
    if isinstance(data, list):
        text = extract_text_from_segments(data)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("Invalid input data format")

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    data2 = {
        'text': text,
        # BUG FIX: the configured model was loaded above but a hard-coded
        # 'tabby' string was previously sent instead.
        'model': model
    }
    tabby_api_ip = loaded_config_data['local_apis']['tabby']['ip']
    try:
        response = requests.post(tabby_api_ip, headers=headers, json=data2)
        response.raise_for_status()
        summary = response.json().get('summary', '')
        return summary
    except requests.exceptions.RequestException as e:
        logging.error(f"Error summarizing with TabbyAPI: {e}")
        return "Error summarizing with TabbyAPI."
299
+
300
+
301
+ # FIXME aphrodite engine - code was literally tab complete in one go from copilot... :/
302
def chat_with_aphrodite(input_data, custom_prompt_input, api_key=None, api_IP=None):
    """Send text to a local Aphrodite Engine server for summarization.

    Args:
        input_data: Text content to process.
        custom_prompt_input: Instructions for the model.
            NOTE(review): currently not included in the request payload —
            confirm whether the endpoint should receive it.
        api_key: Optional key; falls back to the config file.
        api_IP: Endpoint URL; defaults to "http://" + the config value
            `local_apis.aphrodite.ip`.

    Returns:
        The summary text, or an error string on request failure.
    """
    loaded_config_data = load_and_log_configs()
    # BUG FIX: the default api_IP was previously computed by calling
    # load_and_log_configs() in the def line, which executes at import time
    # (import-order fragility + config side effects). Resolve it lazily here;
    # the effective default value is unchanged.
    if api_IP is None:
        api_IP = "http://" + loaded_config_data['local_apis']['aphrodite']['ip']
    # NOTE(review): the configured model is loaded but the current payload has
    # no model field — confirm the endpoint's expected schema.
    model = loaded_config_data['models']['aphrodite']
    # API key validation: fall back to the config file when not supplied.
    if api_key is None:
        logging.info("aphrodite: API key not provided as parameter")
        logging.info("aphrodite: Attempting to use API key from config file")
        api_key = loaded_config_data['api_keys']['aphrodite']

    if api_key is None or api_key.strip() == "":
        logging.info("aphrodite: API key not found or is empty")

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    data2 = {
        'text': input_data,
    }
    try:
        response = requests.post(api_IP, headers=headers, json=data2)
        response.raise_for_status()
        summary = response.json().get('summary', '')
        return summary
    except requests.exceptions.RequestException as e:
        logging.error(f"Error summarizing with Aphrodite: {e}")
        return "Error summarizing with Aphrodite."
329
+
330
+
331
+
332
+
333
def save_summary_to_file(summary, file_path):
    """Write `summary` beside `file_path` as `<basename>_summary.txt`.

    Example: /x/clip.segments.json -> /x/clip.segments_summary.txt

    Args:
        summary: Text to write.
        file_path: Path of the source file the summary belongs to.
    """
    logging.debug("Now saving summary to file...")
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    parent_dir = os.path.dirname(file_path)
    summary_file_path = os.path.join(parent_dir, base_name + '_summary.txt')
    # BUG FIX: os.makedirs('') raises FileNotFoundError when file_path has no
    # directory component; only create the directory when one is present.
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    logging.debug("Opening summary file for writing, *segments.json with *_summary.txt")
    with open(summary_file_path, 'w') as file:
        file.write(summary)
    logging.info(f"Summary saved to file: {summary_file_path}")
342
+
343
+ #
344
+ #
345
+ #######################################################################################################################
346
+
347
+
348
+
App_Function_Libraries/Local_File_Processing_Lib.py ADDED
@@ -0,0 +1,90 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local_File_Processing_Lib.py
2
+ #########################################
3
+ # Local File Processing and File Path Handling Library
4
+ # This library is used to handle processing local filepaths and URLs.
5
+ # It checks for the OS, the availability of the GPU, and the availability of the ffmpeg executable.
6
+ # If the GPU is available, it asks the user if they would like to use it for processing.
7
+ # If ffmpeg is not found, it asks the user if they would like to download it.
8
+ # The script will exit if the user chooses not to download ffmpeg.
9
+ ####
10
+
11
+ ####################
12
+ # Function List
13
+ #
14
+ # 1. read_paths_from_file(file_path)
15
+ # 2. process_path(path)
16
+ # 3. process_local_file(file_path)
17
+ # 4. read_paths_from_file(file_path: str) -> List[str]
18
+ #
19
+ ####################
20
+
21
+ # Import necessary libraries
22
+ # Import Local
23
+ from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav
24
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import *
25
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
26
+ from App_Function_Libraries.Utils import normalize_title, create_download_directory
27
+
28
+ #######################################################################################################################
29
+ # Function Definitions
30
+ #
31
+
32
def read_paths_from_file(file_path):
    """Read a file of URLs or local file paths, one entry per line.

    Surrounding whitespace (including the trailing newline) is stripped from
    each entry.
    """
    with open(file_path, 'r') as handle:
        raw_lines = handle.readlines()
    return [line.strip() for line in raw_lines]
38
+
39
+
40
def process_path(path):
    """Dispatch a single entry: URLs go to get_youtube, existing local files
    to process_local_file; anything else is logged and yields None."""
    if path.startswith('http'):
        logging.debug("file is a URL")
        # For YouTube URLs, modify to download and extract info
        return get_youtube(path)
    if os.path.exists(path):
        logging.debug("File is a path")
        # For local files, define a function to handle them
        return process_local_file(path)
    logging.error(f"Path does not exist: {path}")
    return None
53
+
54
+
55
+ # FIXME - ingest_text is not used, need to confirm.
56
def process_local_file(file_path, ingest_text=False):
    """Process a local file into a (download_path, info_dict, payload) triple.

    Behavior by file type:
      * .txt with ingest_text=True  -> (dir, {'title': basename}, file_path)
      * .txt with ingest_text=False -> (None, None, list-of-lines) — treated
        as a potential list of URLs
      * media (.mp4/.avi/.mov/.wav/.mp3/.m4a) -> converts to wav and returns
        (download_path, {'title': title}, audio_file_path)
      * anything else / missing file -> (None, None, None)
    """
    logging.info(f"Processing local file: {file_path}")

    if os.path.isfile(file_path):
        lower_path = file_path.lower()
        if lower_path.endswith('.txt'):
            if ingest_text:
                # Treat as content to be ingested
                return os.path.dirname(file_path), {'title': os.path.basename(file_path)}, file_path
            else:
                # Treat as potential list of URLs
                with open(file_path, 'r') as file:
                    urls = file.read().splitlines()
                return None, None, urls
        elif lower_path.endswith(('.mp4', '.avi', '.mov', '.wav', '.mp3', '.m4a')):
            # Handle video and audio files
            title = normalize_title(os.path.splitext(os.path.basename(file_path))[0])
            info_dict = {'title': title}
            logging.debug(f"Creating {title} directory...")
            download_path = create_download_directory(title)
            logging.debug(f"Converting '{title}' to an audio file (wav).")
            audio_file = convert_to_wav(file_path)
            logging.debug(f"'{title}' successfully converted to an audio file (wav).")
            return download_path, info_dict, audio_file
        else:
            # BUG FIX: unsupported extensions previously fell off the end of
            # the function and returned a bare None instead of the 3-tuple
            # callers unpack. (The unused `file_extension` local was removed.)
            logging.error(f"Unsupported file type: {file_path}")
            return None, None, None
    else:
        logging.error(f"File not found: {file_path}")
        return None, None, None
83
+
84
+
85
+
86
+
87
+
88
+ #
89
+ #
90
+ #######################################################################################################################
App_Function_Libraries/Local_LLM_Inference_Engine_Lib.py ADDED
@@ -0,0 +1,590 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local_LLM_Inference_Engine_Lib.py
2
+ #########################################
3
+ # Local LLM Inference Engine Library
4
+ # This library is used to handle downloading, configuring, and launching the Local LLM Inference Engine
5
+ # via (llama.cpp via llamafile)
6
+ #
7
+ #
8
+ ####
9
+ ####################
10
+ # Function List
11
+ #
12
+ # 1. download_latest_llamafile(repo, asset_name_prefix, output_filename)
13
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
14
+ # 3. verify_checksum(file_path, expected_checksum)
15
+ # 4. cleanup_process()
16
+ # 5. signal_handler(sig, frame)
17
+ # 6. local_llm_function()
18
+ # 7. launch_in_new_terminal_windows(executable, args)
19
+ # 8. launch_in_new_terminal_linux(executable, args)
20
+ # 9. launch_in_new_terminal_mac(executable, args)
21
+ #
22
+ ####################
23
# Import necessary libraries
from asyncio import subprocess
import subprocess  # NOTE(review): must come after the asyncio import — rebinds 'subprocess' to the real stdlib module; asyncio.subprocess has no Popen, which the launch helpers below rely on
import atexit
import re
import sys
import time
# Import 3rd-pary Libraries
#
# Import Local
from Article_Summarization_Lib import *
from App_Function_Libraries.Utils import download_file
34
+ #
35
+ #
36
+ #######################################################################################################################
37
+ # Function Definitions
38
+ #
39
+
40
+ # Download latest llamafile from Github
41
+ # Example usage
42
+ #repo = "Mozilla-Ocho/llamafile"
43
+ #asset_name_prefix = "llamafile-"
44
+ #output_filename = "llamafile"
45
+ #download_latest_llamafile(repo, asset_name_prefix, output_filename)
46
+
47
# THIS SHOULD ONLY BE CALLED IF THE USER IS USING THE GUI TO SETUP LLAMAFILE
# Function is used to download only llamafile
def download_latest_llamafile_no_model(output_filename):
    """Download the latest llamafile release binary from GitHub.

    Skips the download entirely when ``output_filename`` already exists on disk.
    No LLM model is downloaded by this variant.

    Args:
        output_filename: local filename to save the llamafile binary as.

    Returns:
        output_filename, unchanged (useful as the launch path).

    Raises:
        Exception: if any GitHub API request or the asset download fails.
    """
    print("Checking for and downloading Llamafile if it doesn't already exist...")
    if os.path.exists(output_filename):
        # Early return replaces the original llamafile_exists flag juggling.
        print("Llamafile already exists. Skipping download.")
        logging.debug(f"{output_filename} already exists. Skipping download.")
        return output_filename

    # Establish variables for Llamafile download
    repo = "Mozilla-Ocho/llamafile"
    asset_name_prefix = "llamafile-"

    # Get the latest release information
    latest_release_url = f"https://api.github.com/repos/{repo}/releases/latest"
    response = requests.get(latest_release_url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch latest release info: {response.status_code}")
    tag_name = response.json()['tag_name']

    # Get the release details using the tag name
    release_details_url = f"https://api.github.com/repos/{repo}/releases/tags/{tag_name}"
    response = requests.get(release_details_url)
    if response.status_code != 200:
        raise Exception(f"Failed to fetch release details for tag {tag_name}: {response.status_code}")
    assets = response.json().get('assets', [])

    # Find the first asset whose name starts with the prefix
    # (startswith replaces the original unanchored-looking re.match).
    asset_url = None
    for asset in assets:
        if asset['name'].startswith(asset_name_prefix):
            asset_url = asset['browser_download_url']
            break
    if not asset_url:
        raise Exception(f"No asset found with prefix {asset_name_prefix}")

    # Download the asset
    response = requests.get(asset_url)
    if response.status_code != 200:
        raise Exception(f"Failed to download asset: {response.status_code}")

    print("Llamafile downloaded successfully.")
    logging.debug("Main: Llamafile downloaded successfully.")

    # Save the file
    with open(output_filename, 'wb') as file:
        file.write(response.content)

    logging.debug(f"Downloaded {output_filename} from {asset_url}")
    print(f"Downloaded {output_filename} from {asset_url}")
    return output_filename
108
+
109
+
110
# FIXME - Add option in GUI for selecting the other models for download
# Should only be called from 'local_llm_gui_function' - if its called from anywhere else, shits broken.
# Function is used to download llamafile + A model from Huggingface
def download_latest_llamafile_through_gui(repo, asset_name_prefix, output_filename):
    """Download the latest llamafile release from GitHub and, if no known local
    model file exists, (attempt to) download an LLM from Huggingface. GUI variant.

    Args:
        repo: GitHub repo in "owner/name" form (e.g. "Mozilla-Ocho/llamafile").
        asset_name_prefix: prefix of the release asset to select.
        output_filename: destination filename for the llamafile binary.

    Returns:
        output_filename, unchanged.

    Raises:
        Exception: on any GitHub API or asset-download failure.
    """
    # Check if the file already exists
    print("Checking for and downloading Llamafile it it doesn't already exist...")
    if os.path.exists(output_filename):
        print("Llamafile already exists. Skipping download.")
        logging.debug(f"{output_filename} already exists. Skipping download.")
        llamafile_exists = True
    else:
        llamafile_exists = False

    if llamafile_exists == True:
        pass
    else:
        # Get the latest release information
        latest_release_url = f"https://api.github.com/repos/{repo}/releases/latest"
        response = requests.get(latest_release_url)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch latest release info: {response.status_code}")

        latest_release_data = response.json()
        tag_name = latest_release_data['tag_name']

        # Get the release details using the tag name
        release_details_url = f"https://api.github.com/repos/{repo}/releases/tags/{tag_name}"
        response = requests.get(release_details_url)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch release details for tag {tag_name}: {response.status_code}")

        release_data = response.json()
        assets = release_data.get('assets', [])

        # Find the asset with the specified prefix
        asset_url = None
        for asset in assets:
            if re.match(f"{asset_name_prefix}.*", asset['name']):
                asset_url = asset['browser_download_url']
                break

        if not asset_url:
            raise Exception(f"No asset found with prefix {asset_name_prefix}")

        # Download the asset
        # NOTE(review): the whole asset is buffered in memory before being written.
        response = requests.get(asset_url)
        if response.status_code != 200:
            raise Exception(f"Failed to download asset: {response.status_code}")

        print("Llamafile downloaded successfully.")
        logging.debug("Main: Llamafile downloaded successfully.")

        # Save the file
        with open(output_filename, 'wb') as file:
            file.write(response.content)

        logging.debug(f"Downloaded {output_filename} from {asset_url}")
        print(f"Downloaded {output_filename} from {asset_url}")

    # Check to see if the LLM already exists, and if not, download the LLM
    print("Checking for and downloading LLM from Huggingface if needed...")
    logging.debug("Main: Checking and downloading LLM from Huggingface if needed...")
    mistral_7b_instruct_v0_2_q8_0_llamafile = "mistral-7b-instruct-v0.2.Q8_0.llamafile"
    Samantha_Mistral_Instruct_7B_Bulleted_Notes_Q8 = "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
    Phi_3_mini_128k_instruct_Q8_0_gguf = "Phi-3-mini-128k-instruct-Q8_0.gguf"
    if os.path.exists(mistral_7b_instruct_v0_2_q8_0_llamafile):
        llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
        print("Model is already downloaded. Skipping download.")
        pass
    elif os.path.exists(Samantha_Mistral_Instruct_7B_Bulleted_Notes_Q8):
        llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
        print("Model is already downloaded. Skipping download.")
        pass
    # NOTE(review): this branch re-checks the Mistral llamafile already tested above;
    # Phi_3_mini_128k_instruct_Q8_0_gguf is defined but never checked — likely a
    # copy/paste bug, confirm intent.
    elif os.path.exists(mistral_7b_instruct_v0_2_q8_0_llamafile):
        llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
        print("Model is already downloaded. Skipping download.")
        pass
    else:
        logging.debug("Main: Checking and downloading LLM from Huggingface if needed...")
        print("Downloading LLM from Huggingface...")
        time.sleep(1)
        print("Gonna be a bit...")
        time.sleep(1)
        print("Like seriously, an 8GB file...")
        time.sleep(2)
        # Not needed for GUI
        # dl_check = input("Final chance to back out, hit 'N'/'n' to cancel, or 'Y'/'y' to continue: ")
        #if dl_check == "N" or dl_check == "n":
        #    exit()
        # NOTE(review): x is hard-coded to 2, so `x != 1` is always true and this
        # function always exit()s here — the model-download code below appears
        # unreachable. Looks like a placeholder for the removed prompt; confirm intent.
        x = 2
        if x != 1:
            print("Uhhhh how'd you get here...?")
            exit()
        else:
            print("Downloading LLM from Huggingface...")
            # Establish hash values for LLM models
            mistral_7b_instruct_v0_2_q8_gguf_sha256 = "f326f5f4f137f3ad30f8c9cc21d4d39e54476583e8306ee2931d5a022cb85b06"
            samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256 = "6334c1ab56c565afd86535271fab52b03e67a5e31376946bce7bf5c144e847e4"
            mistral_7b_instruct_v0_2_q8_0_llamafile_sha256 = "1ee6114517d2f770425c880e5abc443da36b193c82abec8e2885dd7ce3b9bfa6"
            global llm_choice

            # FIXME - llm_choice
            llm_choice = 2
            # NOTE(review): this is the GUI path but it still blocks on stdin here.
            llm_choice = input("Which LLM model would you like to download? 1. Mistral-7B-Instruct-v0.2-GGUF or 2. Samantha-Mistral-Instruct-7B-Bulleted-Notes) (plain or 'custom') or MS Flavor: Phi-3-mini-128k-instruct-Q8_0.gguf \n\n\tPress '1' or '2' or '3' to specify: ")
            # NOTE(review): on an invalid answer this loop prints forever — it never
            # re-prompts, so llm_choice can never change inside the loop.
            while llm_choice != "1" and llm_choice != "2" and llm_choice != "3":
                print("Invalid choice. Please try again.")
            if llm_choice == "1":
                llm_download_model = "Mistral-7B-Instruct-v0.2-Q8.llamafile"
                mistral_7b_instruct_v0_2_q8_0_llamafile_sha256 = "1ee6114517d2f770425c880e5abc443da36b193c82abec8e2885dd7ce3b9bfa6"
                llm_download_model_hash = mistral_7b_instruct_v0_2_q8_0_llamafile_sha256
                llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
                llamafile_llm_output_filename = "mistral-7b-instruct-v0.2.Q8_0.llamafile"
                download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
            elif llm_choice == "2":
                llm_download_model = "Samantha-Mistral-Instruct-7B-Bulleted-Notes-Q8.gguf"
                samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256 = "6334c1ab56c565afd86535271fab52b03e67a5e31376946bce7bf5c144e847e4"
                llm_download_model_hash = samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256
                llamafile_llm_output_filename = "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
                llamafile_llm_url = "https://huggingface.co/cognitivetech/samantha-mistral-instruct-7b-bulleted-notes-GGUF/resolve/main/samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf?download=true"
                download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
            elif llm_choice == "3":
                llm_download_model = "Phi-3-mini-128k-instruct-Q8_0.gguf"
                Phi_3_mini_128k_instruct_Q8_0_gguf_sha256 = "6817b66d1c3c59ab06822e9732f0e594eea44e64cae2110906eac9d17f75d193"
                llm_download_model_hash = Phi_3_mini_128k_instruct_Q8_0_gguf_sha256
                llamafile_llm_output_filename = "Phi-3-mini-128k-instruct-Q8_0.gguf"
                llamafile_llm_url = "https://huggingface.co/gaianet/Phi-3-mini-128k-instruct-GGUF/resolve/main/Phi-3-mini-128k-instruct-Q8_0.gguf?download=true"
                download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
            # NOTE(review): "4" is unreachable (the while loop only admits 1-3); it
            # also never calls download_file, and the output filename below has a
            # leading space — confirm before enabling.
            elif llm_choice == "4":  # FIXME - and meta_Llama_3_8B_Instruct_Q8_0_llamafile_exists == False:
                meta_Llama_3_8B_Instruct_Q8_0_llamafile_sha256 = "406868a97f02f57183716c7e4441d427f223fdbc7fa42964ef10c4d60dd8ed37"
                llm_download_model_hash = meta_Llama_3_8B_Instruct_Q8_0_llamafile_sha256
                llamafile_llm_output_filename = " Meta-Llama-3-8B-Instruct.Q8_0.llamafile"
                llamafile_llm_url = "https://huggingface.co/Mozilla/Meta-Llama-3-8B-Instruct-llamafile/resolve/main/Meta-Llama-3-8B-Instruct.Q8_0.llamafile?download=true"
            else:
                print("Invalid choice. Please try again.")
    return output_filename
245
+
246
+
247
# Maybe replace/ dead code? FIXME
# Function is used to download llamafile + A model from Huggingface
def download_latest_llamafile(repo, asset_name_prefix, output_filename):
    """Download the latest llamafile release from GitHub, then make sure a local
    LLM model file exists, interactively downloading one from Huggingface if not.

    Args:
        repo: GitHub repo in "owner/name" form (e.g. "Mozilla-Ocho/llamafile").
        asset_name_prefix: prefix of the release asset to download.
        output_filename: local filename for the llamafile binary.

    Returns:
        output_filename, unchanged (used as the launch path by callers).

    Raises:
        Exception: on any GitHub API or asset-download failure.
    """
    # --- llamafile binary ------------------------------------------------------
    print("Checking for and downloading Llamafile if it doesn't already exist...")
    if os.path.exists(output_filename):
        print("Llamafile already exists. Skipping download.")
        logging.debug(f"{output_filename} already exists. Skipping download.")
    else:
        # Get the latest release information
        latest_release_url = f"https://api.github.com/repos/{repo}/releases/latest"
        response = requests.get(latest_release_url)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch latest release info: {response.status_code}")
        tag_name = response.json()['tag_name']

        # Get the release details using the tag name
        release_details_url = f"https://api.github.com/repos/{repo}/releases/tags/{tag_name}"
        response = requests.get(release_details_url)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch release details for tag {tag_name}: {response.status_code}")
        assets = response.json().get('assets', [])

        # Find the first asset whose name starts with the specified prefix
        asset_url = None
        for asset in assets:
            if asset['name'].startswith(asset_name_prefix):
                asset_url = asset['browser_download_url']
                break
        if not asset_url:
            raise Exception(f"No asset found with prefix {asset_name_prefix}")

        # Download the asset
        response = requests.get(asset_url)
        if response.status_code != 200:
            raise Exception(f"Failed to download asset: {response.status_code}")

        print("Llamafile downloaded successfully.")
        logging.debug("Main: Llamafile downloaded successfully.")

        # Save the file
        with open(output_filename, 'wb') as file:
            file.write(response.content)
        logging.debug(f"Downloaded {output_filename} from {asset_url}")
        print(f"Downloaded {output_filename} from {asset_url}")

    # --- LLM model -------------------------------------------------------------
    # Check to see if the LLM already exists, and if not, download the LLM
    print("Checking for and downloading LLM from Huggingface if needed...")
    logging.debug("Main: Checking and downloading LLM from Huggingface if needed...")
    mistral_7b_instruct_v0_2_q8_0_llamafile = "mistral-7b-instruct-v0.2.Q8_0.llamafile"
    Samantha_Mistral_Instruct_7B_Bulleted_Notes_Q8 = "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
    Phi_3_mini_128k_instruct_Q8_0_gguf = "Phi-3-mini-128k-instruct-Q8_0.gguf"
    if os.path.exists(mistral_7b_instruct_v0_2_q8_0_llamafile):
        print("Model is already downloaded. Skipping download.")
    elif os.path.exists(Samantha_Mistral_Instruct_7B_Bulleted_Notes_Q8):
        print("Model is already downloaded. Skipping download.")
    elif os.path.exists(Phi_3_mini_128k_instruct_Q8_0_gguf):
        # BUGFIX: this branch previously re-checked the Mistral llamafile, so an
        # already-downloaded Phi-3 model was never detected.
        print("Model is already downloaded. Skipping download.")
    else:
        logging.debug("Main: Checking and downloading LLM from Huggingface if needed...")
        print("Downloading LLM from Huggingface...")
        time.sleep(1)
        print("Gonna be a bit...")
        time.sleep(1)
        print("Like seriously, an 8GB file...")
        time.sleep(2)
        dl_check = input("Final chance to back out, hit 'N'/'n' to cancel, or 'Y'/'y' to continue: ")
        if dl_check == "N" or dl_check == "n":
            exit()
        else:
            print("Downloading LLM from Huggingface...")
            # BUGFIX: the original prompted once and then, on invalid input, spun in
            # a loop that never re-prompted. Re-ask inside the loop instead.
            llm_choice = input("Which LLM model would you like to download? 1. Mistral-7B-Instruct-v0.2-GGUF or 2. Samantha-Mistral-Instruct-7B-Bulleted-Notes) (plain or 'custom') or MS Flavor: Phi-3-mini-128k-instruct-Q8_0.gguf \n\n\tPress '1' or '2' or '3' to specify: ")
            while llm_choice not in ("1", "2", "3"):
                print("Invalid choice. Please try again.")
                llm_choice = input("Press '1', '2' or '3' to specify the model to download: ")
            # Per-choice download parameters (hash, output filename, URL).
            # The unreachable "4" (Meta-Llama-3) branch of the original was dropped:
            # the prompt only admits 1-3 and it never invoked download_file anyway.
            if llm_choice == "1":
                llm_download_model_hash = "1ee6114517d2f770425c880e5abc443da36b193c82abec8e2885dd7ce3b9bfa6"
                llamafile_llm_output_filename = "mistral-7b-instruct-v0.2.Q8_0.llamafile"
                llamafile_llm_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
            elif llm_choice == "2":
                llm_download_model_hash = "6334c1ab56c565afd86535271fab52b03e67a5e31376946bce7bf5c144e847e4"
                llamafile_llm_output_filename = "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
                llamafile_llm_url = "https://huggingface.co/cognitivetech/samantha-mistral-instruct-7b_bulleted-notes_GGUF/resolve/main/samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf?download=true"
            else:  # llm_choice == "3"
                llm_download_model_hash = "6817b66d1c3c59ab06822e9732f0e594eea44e64cae2110906eac9d17f75d193"
                llamafile_llm_output_filename = "Phi-3-mini-128k-instruct-Q8_0.gguf"
                llamafile_llm_url = "https://huggingface.co/gaianet/Phi-3-mini-128k-instruct-GGUF/resolve/main/Phi-3-mini-128k-instruct-Q8_0.gguf?download=true"
            download_file(llamafile_llm_url, llamafile_llm_output_filename, llm_download_model_hash)
    return output_filename
375
+
376
+
377
+
378
+
379
+ # FIXME / IMPLEMENT FULLY
380
+ # File download verification
381
+ #mistral_7b_llamafile_instruct_v02_q8_url = "https://huggingface.co/Mozilla/Mistral-7B-Instruct-v0.2-llamafile/resolve/main/mistral-7b-instruct-v0.2.Q8_0.llamafile?download=true"
382
+ #global mistral_7b_instruct_v0_2_q8_0_llamafile_sha256
383
+ #mistral_7b_instruct_v0_2_q8_0_llamafile_sha256 = "1ee6114517d2f770425c880e5abc443da36b193c82abec8e2885dd7ce3b9bfa6"
384
+
385
+ #mistral_7b_v02_instruct_model_q8_gguf_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.2-GGUF/resolve/main/mistral-7b-instruct-v0.2.Q8_0.gguf?download=true"
386
+ #global mistral_7b_instruct_v0_2_q8_gguf_sha256
387
+ #mistral_7b_instruct_v0_2_q8_gguf_sha256 = "f326f5f4f137f3ad30f8c9cc21d4d39e54476583e8306ee2931d5a022cb85b06"
388
+
389
+ #samantha_instruct_model_q8_gguf_url = "https://huggingface.co/cognitivetech/samantha-mistral-instruct-7b_bulleted-notes_GGUF/resolve/main/samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf?download=true"
390
+ #global samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256
391
+ #samantha_mistral_instruct_7b_bulleted_notes_q8_0_gguf_sha256 = "6334c1ab56c565afd86535271fab52b03e67a5e31376946bce7bf5c144e847e4"
392
+
393
+
394
# Handle to the externally launched llamafile process (set by the launchers below).
process = None


# Function to close out llamafile process on script exit.
def cleanup_process():
    """Kill the tracked external llamafile process, if one was ever launched."""
    global process
    if process is None:
        return
    process.kill()
    logging.debug("Main: Terminated the external process")
401
+
402
+
403
def signal_handler(sig, frame):
    """Termination-signal handler: log the signal, kill the tracked llamafile
    child process via cleanup_process(), and exit with status 0.

    Args:
        sig: signal number delivered by the OS.
        frame: current stack frame (unused; required by the signal module's API).
    """
    logging.info('Signal handler called with signal: %s', sig)
    cleanup_process()
    sys.exit(0)
407
+
408
+
409
# FIXME - Add callout to gradio UI
def local_llm_function():
    """Ensure llamafile and an LLM model are present (downloading if needed),
    then launch llamafile in a new terminal window for the current OS.

    Side effects: may download files, spawns an external process, and registers
    cleanup_process() with atexit. Returns None.
    """
    global process
    repo = "Mozilla-Ocho/llamafile"
    asset_name_prefix = "llamafile-"
    useros = os.name
    if useros == "nt":
        output_filename = "llamafile.exe"
    else:
        output_filename = "llamafile"
    print(
        "WARNING - Checking for existence of llamafile and HuggingFace model, downloading if needed...This could be a while")
    print("WARNING - and I mean a while. We're talking an 8 Gigabyte model here...")
    print("WARNING - Hope you're comfy. Or it's already downloaded.")
    time.sleep(6)
    logging.debug("Main: Checking and downloading Llamafile from Github if needed...")
    llamafile_path = download_latest_llamafile(repo, asset_name_prefix, output_filename)
    logging.debug("Main: Llamafile downloaded successfully.")

    # FIXME - llm_choice should come from the UI rather than being hard-coded.
    global llm_choice
    llm_choice = 1
    # Dispatch table: model choice -> llamafile argv.
    # BUGFIX: the original argv items carried stray whitespace ("8192 ", " -m"),
    # which llamafile would receive as malformed arguments.
    model_arguments = {
        1: ["--ctx-size", "8192", "-m", "mistral-7b-instruct-v0.2.Q8_0.llamafile"],
        2: ["--ctx-size", "8192", "-m", "samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"],
        3: ["--ctx-size", "8192", "-m", "Phi-3-mini-128k-instruct-Q8_0.gguf"],
        4: ["--ctx-size", "8192", "-m", "llama-3"],  # FIXME - placeholder model name
    }
    arguments = model_arguments[llm_choice]

    try:
        logging.info("Main: Launching the LLM (llamafile) in an external terminal window...")
        if useros == "nt":
            launch_in_new_terminal_windows(llamafile_path, arguments)
        elif useros == "posix":
            # NOTE(review): os.name is "posix" on macOS too, so the mac branch
            # below is unreachable in practice — confirm intended OS detection.
            launch_in_new_terminal_linux(llamafile_path, arguments)
        else:
            launch_in_new_terminal_mac(llamafile_path, arguments)
        # FIXME - pid doesn't exist in this context
        #logging.info(f"Main: Launched the {llamafile_path} with PID {process.pid}")
        # BUGFIX: cleanup_process() takes no parameters; registering it with an
        # extra positional argument raised TypeError at interpreter exit.
        atexit.register(cleanup_process)
    except Exception as e:
        logging.error(f"Failed to launch the process: {e}")
        print(f"Failed to launch the process: {e}")
455
+
456
+
457
+ # This function is used to dl a llamafile binary + the Samantha Mistral Finetune model.
458
+ # It should only be called when the user is using the GUI to set up and interact with Llamafile.
459
+ def local_llm_gui_function(am_noob, verbose_checked, threads_checked, threads_value, http_threads_checked, http_threads_value,
460
+ model_checked, model_value, hf_repo_checked, hf_repo_value, hf_file_checked, hf_file_value,
461
+ ctx_size_checked, ctx_size_value, ngl_checked, ngl_value, host_checked, host_value, port_checked,
462
+ port_value):
463
+ # Identify running OS
464
+ useros = os.name
465
+ if useros == "nt":
466
+ output_filename = "llamafile.exe"
467
+ else:
468
+ output_filename = "llamafile"
469
+
470
+ # Build up the commands for llamafile
471
+ built_up_args = []
472
+
473
+ # Identify if the user wants us to do everything for them
474
+ if am_noob == True:
475
+ print("You're a noob. (lol j/k; they're good settings)")
476
+
477
+ # Setup variables for Model download from HF
478
+ repo = "Mozilla-Ocho/llamafile"
479
+ asset_name_prefix = "llamafile-"
480
+ print(
481
+ "WARNING - Checking for existence of llamafile or HuggingFace model (GGUF type), downloading if needed...This could be a while")
482
+ print("WARNING - and I mean a while. We're talking an 8 Gigabyte model here...")
483
+ print("WARNING - Hope you're comfy. Or it's already downloaded.")
484
+ time.sleep(6)
485
+ logging.debug("Main: Checking for Llamafile and downloading from Github if needed...\n\tAlso checking for a "
486
+ "local LLM model...\n\tDownloading if needed...\n\tThis could take a while...\n\tWill be the "
487
+ "'samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf' model...")
488
+ llamafile_path = download_latest_llamafile_through_gui(repo, asset_name_prefix, output_filename)
489
+ logging.debug("Main: Llamafile downloaded successfully.")
490
+
491
+ arguments = []
492
+ # FIXME - llm_choice
493
+ # This is the gui, we can add this as options later
494
+ llm_choice = 2
495
+ # Launch the llamafile in an external process with the specified argument
496
+ if llm_choice == 1:
497
+ arguments = ["--ctx-size", "8192 ", " -m", "mistral-7b-instruct-v0.2.Q8_0.llamafile"]
498
+ elif llm_choice == 2:
499
+ arguments = """--ctx-size 8192 -m samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"""
500
+ elif llm_choice == 3:
501
+ arguments = ["--ctx-size", "8192 ", " -m", "Phi-3-mini-128k-instruct-Q8_0.gguf"]
502
+ elif llm_choice == 4:
503
+ arguments = ["--ctx-size", "8192 ", " -m", "llama-3"]
504
+
505
+ try:
506
+ logging.info("Main(Local-LLM-GUI-noob): Launching the LLM (llamafile) in an external terminal window...")
507
+
508
+ if useros == "nt":
509
+ command = 'start cmd /k "llamafile.exe --ctx-size 8192 -m samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"'
510
+ subprocess.Popen(command, shell=True)
511
+ elif useros == "posix":
512
+ command = "llamafile --ctx-size 8192 -m samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
513
+ subprocess.Popen(command, shell=True)
514
+ else:
515
+ command = "llamafile.exe --ctx-size 8192 -m samantha-mistral-instruct-7b-bulleted-notes.Q8_0.gguf"
516
+ subprocess.Popen(command, shell=True)
517
+ # FIXME - pid doesn't exist in this context
518
+ # logging.info(f"Main: Launched the {llamafile_path} with PID {process.pid}")
519
+ atexit.register(cleanup_process, process)
520
+ except Exception as e:
521
+ logging.error(f"Failed to launch the process: {e}")
522
+ print(f"Failed to launch the process: {e}")
523
+
524
+ else:
525
+ print("You're not a noob.")
526
+ llamafile_path = download_latest_llamafile_no_model(output_filename)
527
+ if verbose_checked == True:
528
+ print("Verbose mode enabled.")
529
+ built_up_args.append("--verbose")
530
+ if threads_checked == True:
531
+ print(f"Threads enabled with value: {threads_value}")
532
+ built_up_args.append(f"--threads {threads_value}")
533
+ if http_threads_checked == True:
534
+ print(f"HTTP Threads enabled with value: {http_threads_value}")
535
+ built_up_args.append(f"--http-threads {http_threads_value}")
536
+ if model_checked == True:
537
+ print(f"Model enabled with value: {model_value}")
538
+ built_up_args.append(f"--model {model_value}")
539
+ if hf_repo_checked == True:
540
+ print(f"Huggingface repo enabled with value: {hf_repo_value}")
541
+ built_up_args.append(f"--hf-repo {hf_repo_value}")
542
+ if hf_file_checked == True:
543
+ print(f"Huggingface file enabled with value: {hf_file_value}")
544
+ built_up_args.append(f"--hf-file {hf_file_value}")
545
+ if ctx_size_checked == True:
546
+ print(f"Context size enabled with value: {ctx_size_value}")
547
+ built_up_args.append(f"--ctx-size {ctx_size_value}")
548
+ if ngl_checked == True:
549
+ print(f"NGL enabled with value: {ngl_value}")
550
+ built_up_args.append(f"--ngl {ngl_value}")
551
+ if host_checked == True:
552
+ print(f"Host enabled with value: {host_value}")
553
+ built_up_args.append(f"--host {host_value}")
554
+ if port_checked == True:
555
+ print(f"Port enabled with value: {port_value}")
556
+ built_up_args.append(f"--port {port_value}")
557
+
558
+ # Lets go ahead and finally launch the bastard...
559
+ try:
560
+ logging.info("Main(Local-LLM-GUI-Main): Launching the LLM (llamafile) in an external terminal window...")
561
+ if useros == "nt":
562
+ launch_in_new_terminal_windows(llamafile_path, built_up_args)
563
+ elif useros == "posix":
564
+ launch_in_new_terminal_linux(llamafile_path, built_up_args)
565
+ else:
566
+ launch_in_new_terminal_mac(llamafile_path, built_up_args)
567
+ # FIXME - pid doesn't exist in this context
568
+ #logging.info(f"Main: Launched the {llamafile_path} with PID {process.pid}")
569
+ atexit.register(cleanup_process, process)
570
+ except Exception as e:
571
+ logging.error(f"Failed to launch the process: {e}")
572
+ print(f"Failed to launch the process: {e}")
573
+
574
+
575
# Launch the executable in a new terminal window # FIXME - really should figure out a cleaner way of doing this...
def launch_in_new_terminal_windows(executable, args):
    """Open a new cmd window on Windows running `executable` with `args` (list of strings)."""
    # BUGFIX: the module-level `from asyncio import subprocess` shadows the real
    # subprocess module, and asyncio.subprocess has no Popen; import locally.
    import subprocess
    joined_args = " ".join(args)
    command = f'start cmd /k "{executable} {joined_args}"'
    subprocess.Popen(command, shell=True)
579
+
580
+
581
# FIXME
def launch_in_new_terminal_linux(executable, args):
    """Open a gnome-terminal window on Linux running `executable` with `args` (list of strings)."""
    # BUGFIX: the module-level `from asyncio import subprocess` shadows the real
    # subprocess module, and asyncio.subprocess has no Popen; import locally.
    import subprocess
    joined_args = " ".join(args)
    command = f'gnome-terminal -- {executable} {joined_args}'
    subprocess.Popen(command, shell=True)
585
+
586
+
587
# FIXME
def launch_in_new_terminal_mac(executable, args):
    """Open a Terminal.app window on macOS running `executable` with `args` (list of strings)."""
    # BUGFIX: the module-level `from asyncio import subprocess` shadows the real
    # subprocess module, and asyncio.subprocess has no Popen; import locally.
    import subprocess
    # NOTE(review): `open -a Terminal.app <file> <args>` does not forward <args>
    # to the program (open requires --args for that) — confirm on macOS.
    joined_args = " ".join(args)
    command = f'open -a Terminal.app {executable} {joined_args}'
    subprocess.Popen(command, shell=True)
App_Function_Libraries/Local_Summarization_Lib.py ADDED
@@ -0,0 +1,467 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Local_Summarization_Lib.py
2
+ #########################################
3
+ # Local Summarization Library
4
+ # This library is used to perform summarization with a 'local' inference engine.
5
+ #
6
+ ####
7
+ #
8
+ ####################
9
+ # Function List
10
+ # FIXME - UPDATE Function Arguments
11
+ # 1. summarize_with_local_llm(text, custom_prompt_arg)
12
+ # 2. summarize_with_llama(api_url, text, token, custom_prompt)
13
+ # 3. summarize_with_kobold(api_url, text, kobold_api_token, custom_prompt)
14
+ # 4. summarize_with_oobabooga(api_url, text, ooba_api_token, custom_prompt)
15
+ # 5. summarize_with_vllm(vllm_api_url, vllm_api_key_function_arg, llm_model, text, vllm_custom_prompt_function_arg)
16
+ # 6. summarize_with_tabbyapi(tabby_api_key, tabby_api_IP, text, tabby_model, custom_prompt)
17
+ # 7. save_summary_to_file(summary, file_path)
18
+ #
19
+ ###############################
20
+ # Import necessary libraries
21
+ import json
22
+ import logging
23
+ import os
24
+ import requests
25
+ # Import 3rd-party Libraries
26
+ from openai import OpenAI
27
+ # Import Local
28
+ from App_Function_Libraries.Utils import load_and_log_configs
29
+ from App_Function_Libraries.Utils import extract_text_from_segments
30
+ #
31
+ #######################################################################################################################
32
+ # Function Definitions
33
+ #
34
+
35
# Module-level logger; the root logger is used (no name), so records flow to
# whatever handlers the application configured.
logger = logging.getLogger()

# Dirty hack for vLLM
# A placeholder key is supplied because the OpenAI client requires one even
# when talking to a local server that ignores authentication.
openai_api_key = "Fake_key"
client = OpenAI(api_key=openai_api_key)
40
+
41
def summarize_with_local_llm(input_data, custom_prompt_arg):
    """Summarize text with a local OpenAI-compatible server on 127.0.0.1:8080.

    Args:
        input_data: Path to a JSON file (segment list or dict), a segment
            list, or raw text.
        custom_prompt_arg: Prompt text appended after the transcript.

    Returns:
        The summary string on success, or a human-readable error string.
    """
    try:
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Local LLM: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            # FIX: this branch previously logged under the "openai:" tag.
            logging.debug("Local LLM: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Local LLM: Loaded data: {data}")
        logging.debug(f"Local LLM: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data already contains a summary, reuse it.
            logging.debug("Local LLM: Summary already exists in the loaded data")
            return data['summary']

        # A list is treated as transcription segments; a string is raw text.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        headers = {
            'Content-Type': 'application/json'
        }

        logging.debug("Local LLM: Preparing data + prompt for submittal")
        local_llm_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        data = {
            "messages": [
                {
                    "role": "system",
                    "content": "You are a professional summarizer."
                },
                {
                    "role": "user",
                    "content": local_llm_prompt
                }
            ],
            "max_tokens": 28000,  # Adjust tokens as needed
        }
        logging.debug("Local LLM: Posting request")
        response = requests.post('http://127.0.0.1:8080/v1/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("Local LLM: Summarization successful")
                print("Local LLM: Summarization successful.")
                return summary
            else:
                logging.warning("Local LLM: Summary not found in the response data")
                return "Local LLM: Summary not available"
        else:
            # FIX: failures were previously logged at DEBUG level.
            logging.error("Local LLM: Summarization failed")
            print("Local LLM: Failed to process summary:", response.text)
            return "Local LLM: Failed to process summary"
    except Exception as e:
        # FIX: exceptions were previously logged at DEBUG level.
        logging.error("Local LLM: Error in processing: %s", str(e))
        print("Error occurred while processing summary with Local LLM:", str(e))
        return "Local LLM: Error occurred while processing summary"
108
+
109
def summarize_with_llama(input_data, custom_prompt, api_url="http://127.0.0.1:8080/completion", api_key=None):
    """Summarize text via a llama.cpp server's /completion endpoint.

    Args:
        input_data: Path to a JSON file, a segment list, or raw text.
        custom_prompt: Prompt text appended after the transcript.
        api_url: llama.cpp completion endpoint URL.
        api_key: Optional bearer token; falls back to the config file.

    Returns:
        Summary string on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation
        if api_key is None:
            logging.info("llama.cpp: API key not provided as parameter")
            logging.info("llama.cpp: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['llama']

        # FIX: normalize a missing key to "" so the slicing and len() checks
        # below cannot raise TypeError on None (previously swallowed by the
        # broad except and turned into a bogus error return).
        if api_key is None or api_key.strip() == "":
            logging.info("llama.cpp: API key not found or is empty")
            api_key = ""
        else:
            logging.debug(f"llama.cpp: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # Load transcript
        logging.debug("llama.cpp: Loading JSON data")
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Llama.cpp: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Llama.cpp: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Llama.cpp: Loaded data: {data}")
        logging.debug(f"Llama.cpp: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data already contains a summary, reuse it.
            logging.debug("Llama.cpp: Summary already exists in the loaded data")
            return data['summary']

        # A list is treated as transcription segments; a string is raw text.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Llama.cpp: Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }
        if len(api_key) > 5:
            headers['Authorization'] = f'Bearer {api_key}'

        llama_prompt = f"{text} \n\n\n\n{custom_prompt}"
        # FIX: this debug line was missing the f-prefix, so the prompt was
        # never interpolated.
        logging.debug(f"llama: Prompt being sent is {llama_prompt}")

        data = {
            "prompt": llama_prompt
        }

        logging.debug("llama: Submitting request to API endpoint")
        print("llama: Submitting request to API endpoint")
        response = requests.post(api_url, headers=headers, json=data)
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            # llama.cpp returns the generated text in the 'content' field.
            summary = response_data['content'].strip()
            logging.debug("llama: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"Llama: API request failed with status code {response.status_code}: {response.text}")
            return f"Llama: API request failed: {response.text}"

    except Exception as e:
        logging.error("Llama: Error in processing: %s", str(e))
        return f"Llama: Error occurred while processing summary with llama: {str(e)}"
184
+
185
+
186
+ # https://lite.koboldai.net/koboldcpp_api#/api%2Fv1/post_api_v1_generate
187
def summarize_with_kobold(input_data, api_key, custom_prompt_input, kobold_api_IP="http://127.0.0.1:5001/api/v1/generate"):
    """Summarize text via a KoboldCpp /api/v1/generate endpoint.

    Args:
        input_data: Path to a JSON file, a segment list, or raw text.
        api_key: Optional API key; falls back to the config file.
        custom_prompt_input: Prompt text appended after the transcript.
        kobold_api_IP: Kobold generate endpoint URL.

    Returns:
        Summary string on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation
        if api_key is None:
            logging.info("Kobold.cpp: API key not provided as parameter")
            logging.info("Kobold.cpp: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['kobold']

        if api_key is None or api_key.strip() == "":
            logging.info("Kobold.cpp: API key not found or is empty")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Kobold.cpp: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Kobold.cpp: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Kobold.cpp: Loaded data: {data}")
        logging.debug(f"Kobold.cpp: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data already contains a summary, reuse it.
            logging.debug("Kobold.cpp: Summary already exists in the loaded data")
            return data['summary']

        # A list is treated as transcription segments; a string is raw text.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Kobold.cpp: Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        kobold_prompt = f"{text} \n\n\n\n{custom_prompt_input}"
        # FIX: this debug line was missing the f-prefix, so the prompt was
        # never interpolated.
        logging.debug(f"kobold: Prompt being sent is {kobold_prompt}")

        # FIXME
        # Values literally c/p from the api docs....
        # FIX: the payload previously rebuilt a slightly different prompt
        # inline, leaving kobold_prompt unused; send the variable instead.
        data = {
            "max_context_length": 8096,
            "max_length": 4096,
            "prompt": kobold_prompt
        }

        logging.debug("kobold: Submitting request to API endpoint")
        print("kobold: Submitting request to API endpoint")
        response = requests.post(kobold_api_IP, headers=headers, json=data)
        response_data = response.json()
        logging.debug("kobold: API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'results' in response_data and len(response_data['results']) > 0:
                summary = response_data['results'][0]['text'].strip()
                logging.debug("kobold: Summarization successful")
                print("Summarization successful.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"kobold: API request failed with status code {response.status_code}: {response.text}")
            return f"kobold: API request failed: {response.text}"

    except Exception as e:
        logging.error("kobold: Error in processing: %s", str(e))
        return f"kobold: Error occurred while processing summary with kobold: {str(e)}"
262
+
263
+
264
+ # https://github.com/oobabooga/text-generation-webui/wiki/12-%E2%80%90-OpenAI-API
265
def summarize_with_oobabooga(input_data, api_key, custom_prompt, api_url="http://127.0.0.1:5000/v1/chat/completions"):
    """Summarize text via an oobabooga text-generation-webui OpenAI endpoint.

    Args:
        input_data: Path to a JSON file, a segment list, or raw text.
        api_key: Optional API key; falls back to the config file.
        custom_prompt: Prompt text appended after the transcript.
        api_url: Chat-completions endpoint URL.

    Returns:
        Summary string on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation
        if api_key is None:
            logging.info("ooba: API key not provided as parameter")
            logging.info("ooba: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['ooba']

        if api_key is None or api_key.strip() == "":
            logging.info("ooba: API key not found or is empty")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Oobabooga: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Oobabooga: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Oobabooga: Loaded data: {data}")
        logging.debug(f"Oobabooga: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data already contains a summary, reuse it.
            logging.debug("Oobabooga: Summary already exists in the loaded data")
            return data['summary']

        # A list is treated as transcription segments; a string is raw text.
        if isinstance(data, list):
            segments = data
            text = extract_text_from_segments(segments)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
        }

        ooba_prompt = f"{text}" + f"\n\n\n\n{custom_prompt}"
        # FIX: this debug line was missing the f-prefix, so the prompt was
        # never interpolated.
        logging.debug(f"ooba: Prompt being sent is {ooba_prompt}")

        data = {
            "mode": "chat",
            "character": "Example",
            "messages": [{"role": "user", "content": ooba_prompt}]
        }

        logging.debug("ooba: Submitting request to API endpoint")
        print("ooba: Submitting request to API endpoint")
        # NOTE(review): verify=False disables TLS certificate checking;
        # acceptable for localhost, confirm before pointing at remote hosts.
        response = requests.post(api_url, headers=headers, json=data, verify=False)
        logging.debug("ooba: API Response Data: %s", response)

        if response.status_code == 200:
            # FIX: the response body was previously parsed twice
            # (response_data assigned, then response.json() called again).
            response_data = response.json()
            summary = response_data['choices'][0]['message']['content']
            logging.debug("ooba: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"oobabooga: API request failed with status code {response.status_code}: {response.text}")
            return f"ooba: API request failed with status code {response.status_code}: {response.text}"

    except Exception as e:
        logging.error("ooba: Error in processing: %s", str(e))
        return f"ooba: Error occurred while processing summary with oobabooga: {str(e)}"
337
+
338
+
339
+ # FIXME - Install is more trouble than care to deal with right now.
340
def summarize_with_tabbyapi(input_data, custom_prompt_input, api_key=None, api_IP="http://127.0.0.1:5000/v1/chat/completions"):
    """Summarize text via a TabbyAPI server.

    Args:
        input_data: Path to a JSON file, a segment list, or raw text.
        custom_prompt_input: Prompt text appended after the transcript.
        api_key: Optional API key; falls back to the config file.
        api_IP: Endpoint URL, used when the config has no endpoint.

    Returns:
        Summary string on success, otherwise an error string.
    """
    loaded_config_data = load_and_log_configs()
    model = loaded_config_data['models']['tabby']
    # API key validation
    if api_key is None:
        logging.info("tabby: API key not provided as parameter")
        logging.info("tabby: Attempting to use API key from config file")
        api_key = loaded_config_data['api_keys']['tabby']

    if api_key is None or api_key.strip() == "":
        logging.info("tabby: API key not found or is empty")

    if isinstance(input_data, str) and os.path.isfile(input_data):
        logging.debug("tabby: Loading json data for summarization")
        with open(input_data, 'r') as file:
            data = json.load(file)
    else:
        logging.debug("tabby: Using provided string data for summarization")
        data = input_data

    logging.debug(f"tabby: Loaded data: {data}")
    logging.debug(f"tabby: Type of data: {type(data)}")

    if isinstance(data, dict) and 'summary' in data:
        # If the loaded data already contains a summary, reuse it.
        logging.debug("tabby: Summary already exists in the loaded data")
        return data['summary']

    # A list is treated as transcription segments; a string is raw text.
    if isinstance(data, list):
        segments = data
        text = extract_text_from_segments(segments)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("Invalid input data format")

    headers = {
        'Authorization': f'Bearer {api_key}',
        'Content-Type': 'application/json'
    }
    data2 = {
        'text': text,
        # FIX: the configured model name was previously ignored in favor of a
        # hard-coded 'tabby' string.
        'model': model
    }
    # FIX: fall back to the api_IP parameter when the config has no endpoint;
    # previously the parameter was silently ignored.
    tabby_api_ip = loaded_config_data['local_apis']['tabby']['ip'] or api_IP
    try:
        response = requests.post(tabby_api_ip, headers=headers, json=data2)
        response.raise_for_status()
        summary = response.json().get('summary', '')
        return summary
    except requests.exceptions.RequestException as e:
        logger.error(f"Error summarizing with TabbyAPI: {e}")
        return "Error summarizing with TabbyAPI."
394
+
395
+
396
+ # FIXME - https://docs.vllm.ai/en/latest/getting_started/quickstart.html .... Great docs.
397
def summarize_with_vllm(input_data, custom_prompt_input, api_key=None, vllm_api_url="http://127.0.0.1:8000/v1/chat/completions"):
    """Summarize text via a vLLM server's OpenAI-compatible endpoint.

    Args:
        input_data: Path to a JSON file, a segment list, or raw text.
        custom_prompt_input: Prompt text appended after the transcript.
        api_key: Optional API key; falls back to the config file.
        vllm_api_url: Base URL of the vLLM OpenAI-compatible server.

    Returns:
        The summary string produced by the model.
    """
    loaded_config_data = load_and_log_configs()
    llm_model = loaded_config_data['models']['vllm']
    # API key validation
    if api_key is None:
        logging.info("vLLM: API key not provided as parameter")
        logging.info("vLLM: Attempting to use API key from config file")
        # NOTE(review): this reads the 'llama' config slot; confirm whether a
        # dedicated 'vllm' entry exists in the config file.
        api_key = loaded_config_data['api_keys']['llama']

    if api_key is None or api_key.strip() == "":
        logging.info("vLLM: API key not found or is empty")

    # FIX: the client was previously constructed with the *prompt* as the API
    # key, only when the key was missing, and then never used (the module-level
    # client with a fake key and default base_url was used instead). Build it
    # once, correctly, and point it at the requested server.
    vllm_client = OpenAI(
        base_url=vllm_api_url,
        api_key=api_key if api_key else openai_api_key
    )

    if isinstance(input_data, str) and os.path.isfile(input_data):
        logging.debug("vLLM: Loading json data for summarization")
        with open(input_data, 'r') as file:
            data = json.load(file)
    else:
        logging.debug("vLLM: Using provided string data for summarization")
        data = input_data

    logging.debug(f"vLLM: Loaded data: {data}")
    logging.debug(f"vLLM: Type of data: {type(data)}")

    if isinstance(data, dict) and 'summary' in data:
        # If the loaded data already contains a summary, reuse it.
        logging.debug("vLLM: Summary already exists in the loaded data")
        return data['summary']

    # A list is treated as transcription segments; a string is raw text.
    if isinstance(data, list):
        segments = data
        text = extract_text_from_segments(segments)
    elif isinstance(data, str):
        text = data
    else:
        raise ValueError("Invalid input data format")

    custom_prompt = custom_prompt_input

    completion = vllm_client.chat.completions.create(
        model=llm_model,
        messages=[
            {"role": "system", "content": "You are a professional summarizer."},
            {"role": "user", "content": f"{text} \n\n\n\n{custom_prompt}"}
        ]
    )
    vllm_summary = completion.choices[0].message.content
    return vllm_summary
450
+
451
+
452
def save_summary_to_file(summary, file_path):
    """Write *summary* next to *file_path* as '<base>_summary.txt'.

    Args:
        summary: The summary text to persist.
        file_path: Path of the source file (e.g. '*.segments.json'); only its
            directory and base name are used to derive the output path.
    """
    logging.debug("Now saving summary to file...")
    base_name = os.path.splitext(os.path.basename(file_path))[0]
    summary_file_path = os.path.join(os.path.dirname(file_path), base_name + '_summary.txt')
    # FIX: os.path.dirname() returns '' for a bare filename, and
    # os.makedirs('') raises FileNotFoundError - only create a directory
    # when there actually is one.
    summary_dir = os.path.dirname(summary_file_path)
    if summary_dir:
        os.makedirs(summary_dir, exist_ok=True)
    logging.debug("Opening summary file for writing, *segments.json with *_summary.txt")
    with open(summary_file_path, 'w') as file:
        file.write(summary)
    logging.info(f"Summary saved to file: {summary_file_path}")
461
+
462
+ #
463
+ #
464
+ #######################################################################################################################
465
+
466
+
467
+
App_Function_Libraries/Markdown_Export-improvement.py ADDED
@@ -0,0 +1,234 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import logging
3
+ import sqlite3
4
+ from typing import List, Dict
5
+ import os
6
+ import zipfile
7
+ import tempfile
8
+ import shutil
9
+
10
+
11
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Database connection (you'll need to set this up)
# NOTE(review): every helper below calls db.get_connection(); this placeholder
# must be replaced with a real connection wrapper before the module is usable.
db = None  # Replace with your actual database connection
17
+
18
+
19
class DatabaseError(Exception):
    """Raised by the database helpers in this module when an SQLite operation fails."""
    pass
21
+
22
+
23
+ # Database functions
24
def fetch_items_by_keyword(search_query: str) -> List[Dict]:
    """Return media rows whose keywords contain *search_query* (substring match).

    Each result is a dict with 'id', 'title' and 'url' keys.

    Raises:
        DatabaseError: if the underlying SQLite query fails.
    """
    pattern = f'%{search_query}%'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT m.id, m.title, m.url
                FROM Media m
                JOIN MediaKeywords mk ON m.id = mk.media_id
                JOIN Keywords k ON mk.keyword_id = k.id
                WHERE k.keyword LIKE ?
            """, (pattern,))
            rows = cursor.fetchall()
        return [{"id": row[0], "title": row[1], "url": row[2]} for row in rows]
    except sqlite3.Error as e:
        logger.error(f"Error fetching items by keyword: {e}")
        raise DatabaseError(f"Error fetching items by keyword: {e}")
40
+
41
+
42
def fetch_item_details(media_id: int) -> tuple:
    """Fetch (content, prompt, summary) for one media item.

    Missing pieces come back as empty strings; on SQLite errors the problem
    is logged and ("", "", "") is returned.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            # Most recent modification row carries the prompt and summary.
            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
                WHERE media_id = ?
                ORDER BY modification_date DESC
                LIMIT 1
            """, (media_id,))
            mod_row = cursor.fetchone()
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            content_row = cursor.fetchone()

        prompt, summary = (mod_row[0], mod_row[1]) if mod_row else ("", "")
        content = content_row[0] if content_row else ""
        return content, prompt, summary
    except sqlite3.Error as e:
        logger.error(f"Error fetching item details: {e}")
        return "", "", ""
65
+
66
+
67
def browse_items(search_query: str, search_type: str) -> List[Dict]:
    """Search Media rows by Title, URL, Keyword or Content.

    Returns a list of {'id', 'title', 'url'} dicts.

    Raises:
        ValueError: for an unknown *search_type*.
        DatabaseError: if the SQLite query fails.
    """
    column_queries = {
        'Title': "SELECT id, title, url FROM Media WHERE title LIKE ?",
        'URL': "SELECT id, title, url FROM Media WHERE url LIKE ?",
        'Content': "SELECT id, title, url FROM Media WHERE content LIKE ?",
    }
    try:
        # Keyword search goes through the join-table helper instead of a
        # single-column LIKE query.
        if search_type == 'Keyword':
            return fetch_items_by_keyword(search_query)
        if search_type not in column_queries:
            raise ValueError(f"Invalid search type: {search_type}")

        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute(column_queries[search_type], (f'%{search_query}%',))
            rows = cursor.fetchall()
        return [{"id": row[0], "title": row[1], "url": row[2]} for row in rows]
    except sqlite3.Error as e:
        logger.error(f"Error fetching items by {search_type}: {e}")
        raise DatabaseError(f"Error fetching items by {search_type}: {e}")
87
+
88
+
89
+ # Export functions
90
def export_item_as_markdown(media_id: int) -> str:
    """Export a single media item to 'export_item_<id>.md' in the CWD.

    Returns the filename on success, or None on failure.
    """
    try:
        content, prompt, summary = fetch_item_details(media_id)
        title = f"Item {media_id}"  # You might want to fetch the actual title
        markdown_content = f"# {title}\n\n## Prompt\n{prompt}\n\n## Summary\n{summary}\n\n## Content\n{content}"

        filename = f"export_item_{media_id}.md"
        with open(filename, "w", encoding='utf-8') as f:
            f.write(markdown_content)

        # FIX: the log line previously read "to (unknown)" instead of naming
        # the output file.
        logger.info(f"Successfully exported item {media_id} to {filename}")
        return filename
    except Exception as e:
        logger.error(f"Error exporting item {media_id}: {str(e)}")
        return None
105
+
106
+
107
def export_items_by_keyword(keyword: str) -> str:
    """Export every item tagged with *keyword* as a zip of Markdown files.

    Returns the path of the generated zip (placed in the current working
    directory so Gradio can serve it), or None when no items match or the
    export fails.
    """
    try:
        items = fetch_items_by_keyword(keyword)
        if not items:
            logger.warning(f"No items found for keyword: {keyword}")
            return None

        # Create a temporary directory to store individual markdown files
        with tempfile.TemporaryDirectory() as temp_dir:
            folder_name = f"export_keyword_{keyword}"
            export_folder = os.path.join(temp_dir, folder_name)
            os.makedirs(export_folder)

            for item in items:
                content, prompt, summary = fetch_item_details(item['id'])
                markdown_content = f"# {item['title']}\n\n## Prompt\n{prompt}\n\n## Summary\n{summary}\n\n## Content\n{content}"

                # Create individual markdown file for each item
                file_name = f"{item['id']}_{item['title'][:50]}.md"  # Limit filename length
                file_path = os.path.join(export_folder, file_name)
                with open(file_path, "w", encoding='utf-8') as f:
                    f.write(markdown_content)

            # Create a zip file containing all markdown files
            # NOTE(review): the keyword is embedded verbatim in the archive
            # name; a keyword containing path separators would break this -
            # confirm inputs are sanitized upstream.
            zip_filename = f"{folder_name}.zip"
            shutil.make_archive(os.path.join(temp_dir, folder_name), 'zip', export_folder)

            # Move the zip file to a location accessible by Gradio
            # (the temp dir is deleted when the 'with' block exits).
            final_zip_path = os.path.join(os.getcwd(), zip_filename)
            shutil.move(os.path.join(temp_dir, zip_filename), final_zip_path)

            logger.info(f"Successfully exported {len(items)} items for keyword '{keyword}' to {zip_filename}")
            return final_zip_path
    except Exception as e:
        logger.error(f"Error exporting items for keyword '{keyword}': {str(e)}")
        return None
143
+
144
+
145
def export_selected_items(selected_items: List[Dict]) -> str:
    """Export the given items into one combined Markdown file in the CWD.

    Returns the filename on success, or None when nothing was selected or
    the export fails.
    """
    try:
        if not selected_items:
            logger.warning("No items selected for export")
            return None

        markdown_content = "# Selected Items\n\n"
        for item in selected_items:
            content, prompt, summary = fetch_item_details(item['id'])
            markdown_content += f"## {item['title']}\n\n### Prompt\n{prompt}\n\n### Summary\n{summary}\n\n### Content\n{content}\n\n---\n\n"

        filename = "export_selected_items.md"
        with open(filename, "w", encoding='utf-8') as f:
            f.write(markdown_content)

        # FIX: the log line previously read "to (unknown)" instead of naming
        # the output file.
        logger.info(f"Successfully exported {len(selected_items)} selected items to {filename}")
        return filename
    except Exception as e:
        logger.error(f"Error exporting selected items: {str(e)}")
        return None
165
+
166
+
167
+ # Gradio interface functions
168
def display_search_results(search_query: str, search_type: str) -> List[Dict]:
    """Adapt browse_items() results into Gradio-friendly choice dicts.

    Returns a list of {"name": ..., "value": item} entries, or an empty list
    when the database lookup fails.

    NOTE(review): the click handler wires this function to two outputs
    (search_results, error_output) but it returns a single list - confirm
    the expected Gradio output shape.
    """
    try:
        results = browse_items(search_query, search_type)
        return [{"name": f"{item['title']} ({item['url']})", "value": item} for item in results]
    except DatabaseError as e:
        logger.error(f"Error in display_search_results: {str(e)}")
        return []
175
+
176
+
177
+ # Gradio interface
178
# Top-level Gradio UI: search controls, selection list, and export buttons
# wired to the export helpers above.
with gr.Blocks() as demo:
    gr.Markdown("# Content Export Interface")

    with gr.Tab("Search and Export"):
        search_query = gr.Textbox(label="Search Query")
        search_type = gr.Radio(["Title", "URL", "Keyword", "Content"], label="Search By")
        search_button = gr.Button("Search")

        search_results = gr.CheckboxGroup(label="Search Results")
        export_selected_button = gr.Button("Export Selected Items")

        keyword_input = gr.Textbox(label="Enter keyword for export")
        export_by_keyword_button = gr.Button("Export items by keyword")

        export_output = gr.File(label="Download Exported File")

        error_output = gr.Textbox(label="Status/Error Messages", interactive=False)

    # NOTE(review): display_search_results returns one list but two outputs
    # are declared here - confirm against the Gradio API.
    search_button.click(
        fn=display_search_results,
        inputs=[search_query, search_type],
        outputs=[search_results, error_output]
    )

    export_selected_button.click(
        fn=lambda selected: (export_selected_items(selected), "Exported selected items") if selected else (
            None, "No items selected"),
        inputs=[search_results],
        outputs=[export_output, error_output]
    )

    export_by_keyword_button.click(
        fn=lambda keyword: (
            export_items_by_keyword(keyword), f"Exported items for keyword: {keyword}") if keyword else (
            None, "No keyword provided"),
        inputs=[keyword_input],
        outputs=[export_output, error_output]
    )

    # Add functionality to export individual items
    search_results.select(
        fn=lambda item: (export_item_as_markdown(item['id']), f"Exported item: {item['title']}") if item else (
            None, "No item selected"),
        inputs=[gr.State(lambda: search_results.value)],
        outputs=[export_output, error_output]
    )

demo.launch()
226
+
227
+
228
+ # This modified version of export_items_by_keyword does the following:
229
+ #
230
+ # Creates a temporary directory to store individual markdown files.
231
+ # For each item associated with the keyword, it creates a separate markdown file.
232
+ # Places all markdown files in a folder named export_keyword_{keyword}.
233
+ # Creates a zip file containing the folder with all markdown files.
234
+ # Moves the zip file to a location accessible by Gradio for download.
App_Function_Libraries/Obsidian-Importer.py ADDED
@@ -0,0 +1,210 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import yaml
4
+ import sqlite3
5
+ import traceback
6
+ import time
7
+ import zipfile
8
+ import tempfile
9
+ import shutil
10
+ import gradio as gr
11
+ import logging
12
+
13
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Assume db connection is set up elsewhere
# NOTE(review): the importer below dereferences this module global; it must
# be replaced with a real connection wrapper before use.
db = None  # Replace with your actual database connection
19
+
20
+
21
class DatabaseError(Exception):
    """Raised when importing a note into the SQLite database fails."""
    pass
23
+
24
+
25
def scan_obsidian_vault(vault_path):
    """Recursively collect the paths of all Markdown (.md) files in a vault."""
    found = []
    for root, _dirs, filenames in os.walk(vault_path):
        found.extend(
            os.path.join(root, name) for name in filenames if name.endswith('.md')
        )
    return found
32
+
33
+
34
def parse_obsidian_note(file_path):
    """Parse an Obsidian note into title, body, frontmatter, tags and links.

    Args:
        file_path: Path to a .md note file (read as UTF-8).

    Returns:
        Dict with keys 'title', 'content' (frontmatter stripped),
        'frontmatter' (always a dict), 'tags' (#word occurrences),
        'links' ([[...]] targets) and 'file_path'.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    frontmatter = {}
    frontmatter_match = re.match(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
    if frontmatter_match:
        frontmatter_text = frontmatter_match.group(1)
        # FIX: yaml.safe_load returns None for an empty block (and a non-dict
        # for scalar YAML); downstream code calls .get() on the result, so
        # only accept an actual mapping.
        parsed = yaml.safe_load(frontmatter_text)
        if isinstance(parsed, dict):
            frontmatter = parsed
        content = content[frontmatter_match.end():]

    tags = re.findall(r'#(\w+)', content)
    links = re.findall(r'\[\[(.*?)\]\]', content)

    base = os.path.basename(file_path)
    # FIX: str.replace('.md', '') stripped '.md' anywhere in the name
    # (e.g. 'a.mdx.md' -> 'ax'); only drop the trailing extension.
    title = base[:-3] if base.endswith('.md') else base

    return {
        'title': title,
        'content': content,
        'frontmatter': frontmatter,
        'tags': tags,
        'links': links,
        'file_path': file_path  # Add this line
    }
56
+
57
+
58
def import_obsidian_note_to_db(note_data):
    """Insert or update a parsed Obsidian note in the media database.

    An existing note (matched on title + type 'obsidian_note') is updated in
    place and its keyword links rebuilt; a new note is inserted. Tags become
    keywords, the frontmatter is archived as a MediaModifications row, and the
    full-text search index is refreshed.

    :param note_data: dict produced by parse_obsidian_note
    :return: (success: bool, error_message: str | None)
    """
    # Bug fix: predefine so the except handlers below can safely reference it.
    # In the original, an exception raised before the SELECT completed made the
    # handlers themselves crash with NameError on `existing_note`.
    existing_note = None
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()

            cursor.execute("SELECT id FROM Media WHERE title = ? AND type = 'obsidian_note'", (note_data['title'],))
            existing_note = cursor.fetchone()

            if existing_note:
                media_id = existing_note[0]
                cursor.execute("""
                    UPDATE Media
                    SET content = ?, author = ?, ingestion_date = CURRENT_TIMESTAMP
                    WHERE id = ?
                """, (note_data['content'], note_data['frontmatter'].get('author', 'Unknown'), media_id))

                # Keyword links are re-derived from the note's current tags.
                cursor.execute("DELETE FROM MediaKeywords WHERE media_id = ?", (media_id,))
            else:
                cursor.execute("""
                    INSERT INTO Media (title, content, type, author, ingestion_date, url)
                    VALUES (?, ?, 'obsidian_note', ?, CURRENT_TIMESTAMP, ?)
                """, (note_data['title'], note_data['content'], note_data['frontmatter'].get('author', 'Unknown'),
                      note_data['file_path']))

                # lastrowid is only meaningful right after the INSERT above.
                media_id = cursor.lastrowid

            for tag in note_data['tags']:
                cursor.execute("INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)", (tag,))
                cursor.execute("SELECT id FROM Keywords WHERE keyword = ?", (tag,))
                keyword_id = cursor.fetchone()[0]
                cursor.execute("INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)",
                               (media_id, keyword_id))

            # Archive the frontmatter alongside the note as a YAML blob.
            frontmatter_str = yaml.dump(note_data['frontmatter'])
            cursor.execute("""
                INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                VALUES (?, 'Obsidian Frontmatter', ?, CURRENT_TIMESTAMP)
            """, (media_id, frontmatter_str))

            # Update full-text search index
            cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
                           (media_id, note_data['title'], note_data['content']))

        action = "Updated" if existing_note else "Imported"
        logger.info(f"{action} Obsidian note: {note_data['title']}")
        return True, None
    except sqlite3.Error as e:
        error_msg = f"Database error {'updating' if existing_note else 'importing'} note {note_data['title']}: {str(e)}"
        logger.error(error_msg)
        return False, error_msg
    except Exception as e:
        error_msg = f"Unexpected error {'updating' if existing_note else 'importing'} note {note_data['title']}: {str(e)}\n{traceback.format_exc()}"
        logger.error(error_msg)
        return False, error_msg
112
+
113
+
114
def import_obsidian_vault(vault_path, progress=gr.Progress()):
    """Import every Markdown note under vault_path into the database.

    Emits per-file progress updates through the Gradio progress callback.

    :return: (imported_count, total_count, error_messages)
    """
    try:
        files = scan_obsidian_vault(vault_path)
        total_files = len(files)
        imported_files = 0
        errors = []

        for index, path in enumerate(files):
            try:
                parsed = parse_obsidian_note(path)
                ok, err = import_obsidian_note_to_db(parsed)
                if ok:
                    imported_files += 1
                else:
                    errors.append(err)
            except Exception as e:
                message = f"Error processing {path}: {str(e)}"
                logger.error(message)
                errors.append(message)

            progress((index + 1) / total_files, f"Imported {imported_files} of {total_files} files")
            time.sleep(0.1)  # Small delay to prevent UI freezing

        return imported_files, total_files, errors
    except Exception as e:
        message = f"Error scanning vault: {str(e)}\n{traceback.format_exc()}"
        logger.error(message)
        return 0, 0, [message]
142
+
143
+
144
def process_obsidian_zip(zip_file):
    """Extract an uploaded vault zip into a temp directory and import its notes.

    :param zip_file: path to the uploaded zip archive
    :return: (imported_count, total_count, error_messages)
    """
    with tempfile.TemporaryDirectory() as temp_dir:
        try:
            with zipfile.ZipFile(zip_file, 'r') as zip_ref:
                zip_ref.extractall(temp_dir)

            imported_files, total_files, errors = import_obsidian_vault(temp_dir)
            return imported_files, total_files, errors
        except zipfile.BadZipFile:
            error_msg = "The uploaded file is not a valid zip file."
            logger.error(error_msg)
            return 0, 0, [error_msg]
        except Exception as e:
            error_msg = f"Error processing zip file: {str(e)}\n{traceback.format_exc()}"
            logger.error(error_msg)
            return 0, 0, [error_msg]
        # Fix: the original also called shutil.rmtree(temp_dir) in a `finally`,
        # which was redundant (and raced) with TemporaryDirectory's own cleanup
        # performed when the `with` block exits.
163
+
164
+
165
# Gradio interface
# Builds the import UI at module import time and launches the app at the bottom.
with gr.Blocks() as demo:
    gr.Markdown("# Content Export and Import Interface")

    # ... (your existing tabs and components)

    with gr.Tab("Import Obsidian Vault"):
        gr.Markdown("## Import Obsidian Vault")
        with gr.Row():
            vault_path_input = gr.Textbox(label="Obsidian Vault Path (Local)")
            vault_zip_input = gr.File(label="Upload Obsidian Vault (Zip)")
        import_vault_button = gr.Button("Import Obsidian Vault")
        import_status = gr.Textbox(label="Import Status", interactive=False)


    def import_vault(vault_path, vault_zip):
        # Prefer the uploaded zip when both inputs are supplied; fall back to
        # the local path; otherwise prompt the user for one of the two.
        if vault_zip:
            imported, total, errors = process_obsidian_zip(vault_zip.name)
        elif vault_path:
            imported, total, errors = import_obsidian_vault(vault_path)
        else:
            return "Please provide either a local vault path or upload a zip file."

        # Summarize the outcome, appending any per-file error messages.
        status = f"Imported {imported} out of {total} files.\n"
        if errors:
            status += f"Encountered {len(errors)} errors:\n" + "\n".join(errors)
        return status


    import_vault_button.click(
        fn=import_vault,
        inputs=[vault_path_input, vault_zip_input],
        outputs=[import_status],
        show_progress=True
    )

    # ... (rest of your existing code)

demo.launch()
204
+
205
+ # This comprehensive solution includes:
206
+ #
207
+ # Enhanced error handling throughout the import process.
208
+ # Progress updates for large vaults using Gradio's progress bar.
209
+ # The ability to update existing notes if they're reimported.
210
+ # Support for importing Obsidian vaults from both local directories and uploaded zip files.
App_Function_Libraries/Old_Chunking_Lib.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Old_Chunking_Lib.py
2
+ #########################################
3
+ # Old Chunking Library
4
+ # This library is used to handle chunking of text for summarization.
5
+ #
6
+ ####
7
+ import logging
8
+ ####################
9
+ # Function List
10
+ #
11
+ # 1. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]
12
+ # 2. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
13
+ # 3. get_chat_completion(messages, model='gpt-4-turbo')
14
+ # 4. chunk_on_delimiter(input_string: str, max_tokens: int, delimiter: str) -> List[str]
15
+ # 5. combine_chunks_with_no_minimum(chunks: List[str], max_tokens: int, chunk_delimiter="\n\n", header: Optional[str] = None, add_ellipsis_for_overflow=False) -> Tuple[List[str], List[int]]
16
+ # 6. rolling_summarize(text: str, detail: float = 0, model: str = 'gpt-4-turbo', additional_instructions: Optional[str] = None, minimum_chunk_size: Optional[int] = 500, chunk_delimiter: str = ".", summarize_recursively=False, verbose=False)
17
+ # 7. chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]
18
+ # 8. summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int, words_per_second: int) -> str
19
+ #
20
+ ####################
21
+
22
+ # Import necessary libraries
23
+ import os
24
+ from typing import Optional, List, Tuple
25
+ #
26
+ # Import 3rd party
27
+ from openai import OpenAI
28
+ from App_Function_Libraries.Tokenization_Methods_Lib import openai_tokenize
29
+ #
30
+ # Import Local
31
+ #
32
+ #######################################################################################################################
33
+ # Function Definitions
34
+ #
35
+
36
+ ######### Words-per-second Chunking #########
37
def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
    """Split a transcript into chunks of roughly `chunk_duration` seconds of speech.

    The chunk size is estimated as chunk_duration * words_per_second words.

    :param transcript: full transcript text
    :param chunk_duration: target duration of each chunk, in seconds
    :param words_per_second: estimated speech rate (int or float)
    :return: list of space-joined word chunks (empty list for empty input)
    """
    words = transcript.split()
    # Bug fix: coerce to int and clamp at >= 1. A float words_per_second made
    # range() raise TypeError, and a zero/negative rate would produce an
    # invalid step.
    words_per_chunk = max(1, int(chunk_duration * words_per_second))
    return [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
42
+
43
+
44
+ # def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
45
+ # words_per_second: int) -> str:
46
+ # if api_name not in summarizers: # See 'summarizers' dict in the main script
47
+ # return f"Unsupported API: {api_name}"
48
+ #
49
+ # summarizer = summarizers[api_name]
50
+ # text = extract_text_from_segments(transcript)
51
+ # chunks = chunk_transcript(text, chunk_duration, words_per_second)
52
+ #
53
+ # summaries = []
54
+ # for chunk in chunks:
55
+ # if api_name == 'openai':
56
+ # # Ensure the correct model and prompt are passed
57
+ # summaries.append(summarizer(api_key, chunk, custom_prompt))
58
+ # else:
59
+ # summaries.append(summarizer(api_key, chunk))
60
+ #
61
+ # return "\n\n".join(summaries)
62
+
63
+
64
+ ################## ####################
65
+
66
+
67
+ ######### Token-size Chunking ######### FIXME - OpenAI only currently
68
+ # This is dirty and shameful and terrible. It should be replaced with a proper implementation.
69
+ # anyways lets get to it....
70
# FIXME: placeholder credential — load a real key from env/config before use.
# Any OpenAI call made through `client` will fail while the key is fake.
openai_api_key = "Fake_key"  # FIXME
client = OpenAI(api_key=openai_api_key)
72
+
73
+
74
+
75
+
76
+
77
+ # This function chunks a text into smaller pieces based on a maximum token count and a delimiter
78
def chunk_on_delimiter(input_string: str,
                       max_tokens: int,
                       delimiter: str) -> List[str]:
    """Split on `delimiter`, then re-pack the pieces under a max token budget.

    Each returned chunk has the delimiter re-appended so joining round-trips.
    """
    pieces = input_string.split(delimiter)
    combined, _, dropped = combine_chunks_with_no_minimum(
        pieces, max_tokens, chunk_delimiter=delimiter, add_ellipsis_for_overflow=True)
    if dropped > 0:
        print(f"Warning: {dropped} chunks were dropped due to exceeding the token limit.")
    return [f"{chunk}{delimiter}" for chunk in combined]
88
+
89
+
90
+
91
+
92
+
93
+ #######################################
94
+
95
+
96
+ ######### Words-per-second Chunking #########
97
+ # FIXME - WHole section needs to be re-written
98
# NOTE(review): duplicate of chunk_transcript defined earlier in this module;
# this later definition silently shadows the first one at import time.
def chunk_transcript(transcript: str, chunk_duration: int, words_per_second) -> List[str]:
    """Split a transcript into chunks of roughly `chunk_duration` seconds of speech."""
    words = transcript.split()
    # Estimated words spoken per chunk; assumes words_per_second is an int —
    # a float here would make range() below raise TypeError (TODO confirm callers).
    words_per_chunk = chunk_duration * words_per_second
    chunks = [' '.join(words[i:i + words_per_chunk]) for i in range(0, len(words), words_per_chunk)]
    return chunks
103
+
104
+
105
+ # def summarize_chunks(api_name: str, api_key: str, transcript: List[dict], chunk_duration: int,
106
+ # words_per_second: int) -> str:
107
+ # if api_name not in summarizers: # See 'summarizers' dict in the main script
108
+ # return f"Unsupported API: {api_name}"
109
+ #
110
+ # if not transcript:
111
+ # logging.error("Empty or None transcript provided to summarize_chunks")
112
+ # return "Error: Empty or None transcript provided"
113
+ #
114
+ # text = extract_text_from_segments(transcript)
115
+ # chunks = chunk_transcript(text, chunk_duration, words_per_second)
116
+ #
117
+ # #FIXME
118
+ # custom_prompt = args.custom_prompt
119
+ #
120
+ # summaries = []
121
+ # for chunk in chunks:
122
+ # if api_name == 'openai':
123
+ # # Ensure the correct model and prompt are passed
124
+ # summaries.append(summarize_with_openai(api_key, chunk, custom_prompt))
125
+ # elif api_name == 'anthropic':
126
+ # summaries.append(summarize_with_cohere(api_key, chunk, anthropic_model, custom_prompt))
127
+ # elif api_name == 'cohere':
128
+ # summaries.append(summarize_with_anthropic(api_key, chunk, cohere_model, custom_prompt))
129
+ # elif api_name == 'groq':
130
+ # summaries.append(summarize_with_groq(api_key, chunk, groq_model, custom_prompt))
131
+ # elif api_name == 'llama':
132
+ # summaries.append(summarize_with_llama(llama_api_IP, chunk, api_key, custom_prompt))
133
+ # elif api_name == 'kobold':
134
+ # summaries.append(summarize_with_kobold(kobold_api_IP, chunk, api_key, custom_prompt))
135
+ # elif api_name == 'ooba':
136
+ # summaries.append(summarize_with_oobabooga(ooba_api_IP, chunk, api_key, custom_prompt))
137
+ # elif api_name == 'tabbyapi':
138
+ # summaries.append(summarize_with_vllm(api_key, tabby_api_IP, chunk, summarize.llm_model, custom_prompt))
139
+ # elif api_name == 'local-llm':
140
+ # summaries.append(summarize_with_local_llm(chunk, custom_prompt))
141
+ # else:
142
+ # return f"Unsupported API: {api_name}"
143
+ #
144
+ # return "\n\n".join(summaries)
145
+
146
+ # FIXME - WHole section needs to be re-written
147
def summarize_with_detail_openai(text, detail, verbose=False):
    """Summarize `text` with rolling_summarize at the requested detail level.

    :param text: text to summarize
    :param detail: detail level forwarded to rolling_summarize
    :param verbose: forwarded to rolling_summarize (bug fix: the original
        ignored this parameter and always passed verbose=True)
    :return: the summary string
    """
    summary = rolling_summarize(text, detail=detail, verbose=verbose)
    # Log the token count of the resulting summary for inspection.
    print(len(openai_tokenize(summary)))
    return summary
151
+
152
+
153
def summarize_with_detail_recursive_openai(text, detail, verbose=False):
    """Summarize `text` with recursive (hierarchical) rolling summarization.

    :param text: text to summarize
    :param detail: detail level forwarded to rolling_summarize
    :param verbose: forwarded to rolling_summarize (previously ignored)
    :return: the summary string (bug fix: the original printed the summary
        and returned None, discarding the result)
    """
    summary = rolling_summarize(text, detail=detail, summarize_recursively=True, verbose=verbose)
    print(summary)
    return summary
156
+
157
+ #
158
+ #
159
+ #################################################################################
App_Function_Libraries/PDF_Ingestion_Lib.py ADDED
@@ -0,0 +1,166 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PDF_Ingestion_Lib.py
2
+ #########################################
3
+ # Library to hold functions for ingesting PDF files.#
4
+ #
5
+ ####################
6
+ # Function List
7
+ #
8
+ # 1. convert_pdf_to_markdown(pdf_path)
9
+ # 2. ingest_pdf_file(file_path, title=None, author=None, keywords=None):
10
+ # 3.
11
+ #
12
+ #
13
+ ####################
14
+
15
+
16
+ # Import necessary libraries
17
+ from datetime import datetime
18
+ import logging
19
+ import subprocess
20
+ import os
21
+ import shutil
22
+ import tempfile
23
+
24
+
25
+ # Import Local
26
+ from App_Function_Libraries.SQLite_DB import add_media_with_keywords
27
+
28
+ #######################################################################################################################
29
+ # Function Definitions
30
+ #
31
+
32
+ # Ingest a text file into the database with Title/Author/Keywords
33
+
34
+
35
# Constants
MAX_FILE_SIZE_MB = 50  # Largest PDF (in MB) accepted by convert_pdf_to_markdown
CONVERSION_TIMEOUT_SECONDS = 300  # Wall-clock limit for the Marker conversion subprocess
38
+
39
+
40
def convert_pdf_to_markdown(pdf_path):
    """
    Convert a PDF file to Markdown by calling a script in another virtual environment.

    :param pdf_path: path to the PDF on disk
    :return: the Markdown text produced on the converter's stdout
    :raises ValueError: if the file exceeds MAX_FILE_SIZE_MB
    :raises Exception: on conversion failure or timeout
    """
    logging.debug(f"Marker: Converting PDF file to Markdown: {pdf_path}")
    # Reject oversized inputs before spawning the converter.
    size_mb = os.path.getsize(pdf_path) / (1024 * 1024)
    if size_mb > MAX_FILE_SIZE_MB:
        raise ValueError(f"File size ({size_mb:.2f} MB) exceeds the maximum allowed size of {MAX_FILE_SIZE_MB} MB")

    logging.debug("Marker: Converting PDF file to Markdown using Marker virtual environment")
    # Interpreter inside the dedicated Marker virtual environment.
    marker_python = "Helper_Scripts/marker_venv/bin/python"
    # Script that performs the actual conversion.
    converter_script = "Helper_Scripts/PDF_Converter.py"

    logging.debug("Marker: Attempting to convert PDF file to Markdown...")
    try:
        completed = subprocess.run(
            [marker_python, converter_script, pdf_path],
            capture_output=True,
            text=True,
            timeout=CONVERSION_TIMEOUT_SECONDS
        )
    except subprocess.TimeoutExpired:
        raise Exception(f"PDF conversion timed out after {CONVERSION_TIMEOUT_SECONDS} seconds")

    if completed.returncode != 0:
        raise Exception(f"Conversion failed: {completed.stderr}")
    return completed.stdout
71
+
72
+
73
def process_and_ingest_pdf(file, title, author, keywords):
    """Stage an uploaded PDF in a temp directory and hand it to ingest_pdf_file.

    :return: the status message (or tuple) produced by ingest_pdf_file,
        or an error string on failure.
    """
    if file is None:
        return "Please select a PDF file to upload."

    try:
        # The temp directory (and the staged copy) is removed automatically
        # when the `with` block exits.
        with tempfile.TemporaryDirectory() as temp_dir:
            staged_pdf = os.path.join(temp_dir, "temp.pdf")
            # Work on a copy so the upload's original location is untouched.
            shutil.copy(file.name, staged_pdf)
            return ingest_pdf_file(staged_pdf, title, author, keywords)
    except Exception as e:
        return f"Error processing PDF: {str(e)}"
92
+
93
+
94
def ingest_pdf_file(file_path, title=None, author=None, keywords=None):
    """Convert a PDF to Markdown and store it in the media database.

    Missing metadata falls back to sensible defaults (filename as title,
    'Unknown' author, stock keywords).

    :return: (status_message, file_path)
    """
    try:
        # Convert first — this may raise on oversize or timeout.
        markdown_content = convert_pdf_to_markdown(file_path)

        # Metadata fallbacks.
        title = title or os.path.splitext(os.path.basename(file_path))[0]
        author = author or 'Unknown'
        if keywords:
            keywords = f'pdf_file,markdown_converted,{keywords}'
        else:
            keywords = 'pdf_file,markdown_converted'

        # Persist the converted Markdown.
        add_media_with_keywords(
            url=file_path,
            title=title,
            media_type='document',
            content=markdown_content,
            keywords=keywords,
            prompt='No prompt for PDF files',
            summary='No summary for PDF files',
            transcription_model='None',
            author=author,
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        return f"PDF file '{title}' converted to Markdown and ingested successfully.", file_path
    except ValueError as e:
        # Raised by convert_pdf_to_markdown for oversized files.
        logging.error(f"File size error: {str(e)}")
        return f"Error: {str(e)}", file_path
    except Exception as e:
        logging.error(f"Error ingesting PDF file: {str(e)}")
        return f"Error ingesting PDF file: {str(e)}", file_path
134
+
135
+
136
def process_and_cleanup_pdf(file, title, author, keywords):
    """Ingest an uploaded PDF via a temporary copy, then remove the temp files.

    :return: a human-readable status string, with a warning appended if the
        temporary files could not be removed.
    """
    if file is None:
        return "No file uploaded. Please upload a PDF file."

    temp_dir = tempfile.mkdtemp()
    temp_file_path = os.path.join(temp_dir, "temp.pdf")
    result = None

    try:
        # Copy the uploaded file to a temporary location
        shutil.copy2(file.name, temp_file_path)

        # Process the file
        result, _ = ingest_pdf_file(temp_file_path, title, author, keywords)
    except Exception as e:
        logging.error(f"Error in processing and cleanup: {str(e)}")
        result = f"Error: {str(e)}"
    finally:
        # Clean up the temporary directory and its contents
        try:
            shutil.rmtree(temp_dir)
            logging.info(f"Removed temporary directory: {temp_dir}")
        except Exception as cleanup_error:
            logging.error(f"Error during cleanup: {str(cleanup_error)}")
            # Bug fix: in the original, this append ran in `finally` AFTER the
            # `return result` inside `try` had already captured the value, so
            # the warning never reached the caller — and `result` was unbound
            # (NameError) if the earlier copy/ingest step raised.
            result = (result or "") + f"\nWarning: Could not remove temporary files: {str(cleanup_error)}"

    return result
162
+
163
+
164
+ #
165
+ #
166
+ #######################################################################################################################
App_Function_Libraries/RAG_Library.py ADDED
@@ -0,0 +1,812 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # RAG_Library.py
2
+ #########################################
3
+ # RAG Search & Related Functions Library
4
+ # This library is used to hold any/all RAG-related operations.
5
+ # Currently, all of this code was generated from Sonnet 3.5. 0_0
6
+ #
7
+ ####
8
+
9
+ import os
10
+ from typing import List, Tuple, Callable, Optional
11
+ from contextlib import contextmanager
12
+ import sqlite3
13
+ import numpy as np
14
+ from sentence_transformers import SentenceTransformer
15
+ from sklearn.metrics.pairwise import cosine_similarity
16
+ import logging
17
+ from dotenv import load_dotenv
18
+
19
+ load_dotenv()
20
+
21
+ logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
class RAGException(Exception):
    """Raised when any stage of the RAG pipeline fails (model load, database
    access, retrieval, or response generation)."""
28
+
29
+
30
class BaseRAGSystem:
    """Shared plumbing for RAG systems: a sentence-embedding model plus a
    SQLite-backed document store with per-operation connections."""

    def __init__(self, db_path: str, model_name: Optional[str] = None):
        """
        Initialize the RAG system.

        :param db_path: Path to the SQLite database
        :param model_name: Name of the SentenceTransformer model to use
        :raises RAGException: if the embedding model cannot be loaded or the
            database schema cannot be created
        """
        self.db_path = db_path
        self.model_name = model_name or os.getenv('DEFAULT_MODEL_NAME', 'all-MiniLM-L6-v2')
        try:
            self.model = SentenceTransformer(self.model_name)
            logger.info(f"Initialized SentenceTransformer with model: {self.model_name}")
        except Exception as e:
            logger.error(f"Failed to initialize SentenceTransformer: {e}")
            raise RAGException(f"Model initialization failed: {e}")

        self.init_db()

    @contextmanager
    def get_db_connection(self):
        """Yield a short-lived SQLite connection, always closing it afterwards."""
        conn = sqlite3.connect(self.db_path)
        try:
            yield conn
        finally:
            conn.close()

    def init_db(self):
        """Create the `documents` table if it does not already exist.

        :raises RAGException: on any SQLite error
        """
        try:
            with self.get_db_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('''
                    CREATE TABLE IF NOT EXISTS documents (
                        id INTEGER PRIMARY KEY,
                        title TEXT,
                        content TEXT,
                        embedding BLOB
                    )
                ''')
                conn.commit()
                logger.info("Initialized database schema")
        except sqlite3.Error as e:
            logger.error(f"Failed to initialize database schema: {e}")
            raise RAGException(f"Database schema initialization failed: {e}")

    def add_documents(self, documents: List[Tuple[str, str]]):
        """Embed and insert (title, content) pairs in one batch.

        :raises RAGException: if embedding or insertion fails
        """
        try:
            embeddings = self.model.encode([content for _, content in documents])
            with self.get_db_connection() as conn:
                cursor = conn.cursor()
                cursor.executemany(
                    'INSERT INTO documents (title, content, embedding) VALUES (?, ?, ?)',
                    [(title, content, embedding.tobytes()) for (title, content), embedding in zip(documents, embeddings)]
                )
                conn.commit()
            logger.info(f"Added {len(documents)} documents in batch")
        except Exception as e:
            logger.error(f"Failed to add documents in batch: {e}")
            raise RAGException(f"Batch document addition failed: {e}")

    def get_documents(self) -> List[Tuple[int, str, str, np.ndarray]]:
        """Return every stored document as (id, title, content, embedding).

        Embeddings are decoded from BLOBs as float32 numpy arrays.
        :raises RAGException: on any SQLite error
        """
        try:
            with self.get_db_connection() as conn:
                cursor = conn.cursor()
                cursor.execute('SELECT id, title, content, embedding FROM documents')
                documents = [(id, title, content, np.frombuffer(embedding, dtype=np.float32))
                             for id, title, content, embedding in cursor.fetchall()]
            logger.info(f"Retrieved {len(documents)} documents")
            return documents
        except sqlite3.Error as e:
            logger.error(f"Failed to retrieve documents: {e}")
            raise RAGException(f"Document retrieval failed: {e}")

    def close(self):
        """Release a persistent database connection, if one was ever attached.

        Bug fix: the original referenced `self.conn`, which is never set
        (connections are opened per-operation via get_db_connection), so
        calling close() always raised AttributeError. It is now a safe no-op
        when no persistent connection exists.
        """
        conn = getattr(self, 'conn', None)
        if conn is None:
            logger.info("No persistent database connection to close")
            return
        try:
            conn.close()
            logger.info("Closed database connection")
        except sqlite3.Error as e:
            logger.error(f"Error closing database connection: {e}")
109
+
110
+
111
class StandardRAGSystem(BaseRAGSystem):
    """RAG system that embeds the raw query and ranks stored documents by
    cosine similarity against it."""

    def get_relevant_documents(self, query: str, top_k: int = 3) -> List[Tuple[int, str, str, float]]:
        """Return the top_k documents most similar to the query embedding.

        :raises RAGException: if embedding or retrieval fails
        """
        try:
            query_vec = self.model.encode([query])[0]
            scored = []
            for doc_id, title, content, doc_vec in self.get_documents():
                score = cosine_similarity([query_vec], [doc_vec])[0][0]
                scored.append((doc_id, title, content, score))
            scored.sort(key=lambda item: item[3], reverse=True)
            logger.info(f"Retrieved top {top_k} relevant documents for query")
            return scored[:top_k]
        except Exception as e:
            logger.error(f"Error in getting relevant documents: {e}")
            raise RAGException(f"Retrieval of relevant documents failed: {e}")

    def rag_query(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> str:
        """Answer `query` with llm_function, grounded in the top_k documents.

        :raises RAGException: if retrieval or generation fails
        """
        try:
            docs = self.get_relevant_documents(query, top_k)
            context_parts = []
            for _, title, content, _ in docs:
                context_parts.append(f"Title: {title}\nContent: {content}")
            context = "\n\n".join(context_parts)

            llm_prompt = f"Based on the following context, please answer the query:\n\nContext:\n{context}\n\nQuery: {query}"

            answer = llm_function(llm_prompt)
            logger.info("Generated response for query")
            return answer
        except Exception as e:
            logger.error(f"Error in RAG query: {e}")
            raise RAGException(f"RAG query failed: {e}")
140
+
141
+
142
class HyDERAGSystem(BaseRAGSystem):
    """RAG variant using HyDE: retrieval is driven by the embedding of a
    hypothetical answer document generated by the LLM, not the raw query."""

    def generate_hypothetical_document(self, query: str, llm_function: Callable[[str], str]) -> str:
        """Ask the LLM for a short paragraph that would answer the query.

        :raises RAGException: if generation fails
        """
        try:
            prompt = f"Given the question '{query}', write a short paragraph that would answer this question. Do not include the question itself in your response."
            hypothetical_doc = llm_function(prompt)
            logger.info("Generated hypothetical document")
            return hypothetical_doc
        except Exception as e:
            logger.error(f"Error generating hypothetical document: {e}")
            raise RAGException(f"Hypothetical document generation failed: {e}")

    def get_relevant_documents(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> List[
        Tuple[int, str, str, float]]:
        """Rank documents by similarity to the hypothetical document's embedding.

        :raises RAGException: if generation, embedding, or retrieval fails
        """
        try:
            hypothetical_doc = self.generate_hypothetical_document(query, llm_function)
            hyde_vec = self.model.encode([hypothetical_doc])[0]

            scored = [
                (doc_id, title, content, cosine_similarity([hyde_vec], [doc_vec])[0][0])
                for doc_id, title, content, doc_vec in self.get_documents()
            ]
            scored.sort(key=lambda item: item[3], reverse=True)
            logger.info(f"Retrieved top {top_k} relevant documents using HyDE")
            return scored[:top_k]
        except Exception as e:
            logger.error(f"Error in getting relevant documents with HyDE: {e}")
            raise RAGException(f"HyDE retrieval of relevant documents failed: {e}")

    def rag_query(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> str:
        """Answer `query` with llm_function, grounded in HyDE-retrieved documents.

        :raises RAGException: if retrieval or generation fails
        """
        try:
            docs = self.get_relevant_documents(query, llm_function, top_k)
            context = "\n\n".join(
                f"Title: {title}\nContent: {content}" for _, title, content, _ in docs
            )

            llm_prompt = f"Based on the following context, please answer the query:\n\nContext:\n{context}\n\nQuery: {query}"

            answer = llm_function(llm_prompt)
            logger.info("Generated response for query using HyDE")
            return answer
        except Exception as e:
            logger.error(f"Error in HyDE RAG query: {e}")
            raise RAGException(f"HyDE RAG query failed: {e}")
184
+
185
+
186
+ # Example usage with error handling
187
def mock_llm(prompt: str) -> str:
    """Deterministic stand-in for a real LLM, used by the example `main()`.

    HyDE generation prompts (identified by their 'write a short paragraph'
    phrasing) get a canned Paris paragraph; everything else is echoed back.
    """
    hyde_marker = "write a short paragraph"
    if hyde_marker in prompt:
        return "Paris, the capital of France, is renowned for its iconic Eiffel Tower and rich cultural heritage."
    return f"This is a mock LLM response for the prompt: {prompt}"
192
+
193
+
194
def main():
    """Demo entry point: build a RAG system, load sample documents, run one query."""
    use_hyde = False  # Set this to True when you want to enable HyDE

    # Bind before the try so the finally clause can test it safely.
    rag_system = None
    try:
        if use_hyde:
            rag_system = HyDERAGSystem('rag_database.db')
            logger.info("Using HyDE RAG System")
        else:
            rag_system = StandardRAGSystem('rag_database.db')
            logger.info("Using Standard RAG System")

        # Add sample documents in batch
        sample_docs = [
            ("Paris", "Paris is the capital of France and is known for the Eiffel Tower."),
            ("London", "London is the capital of the United Kingdom and home to Big Ben."),
            ("Tokyo", "Tokyo is the capital of Japan and is famous for its bustling city life.")
        ]

        # Bug fix: the class exposes the batch method `add_documents`, not a
        # per-item `add_document`; the original loop raised AttributeError.
        rag_system.add_documents(sample_docs)

        query = "What is the capital of France?"
        result = rag_system.rag_query(query, mock_llm)
        print(f"Query: {query}")
        print(f"Result: {result}")

    except RAGException as e:
        logger.error(f"RAG system error: {e}")
        print(f"An error occurred: {e}")
    except Exception as e:
        logger.error(f"Unexpected error: {e}")
        print(f"An unexpected error occurred: {e}")
    finally:
        if rag_system is not None:
            rag_system.close()


if __name__ == "__main__":
    main()
233
+
234
+
235
+
236
+ ####################################################################################
237
+ # async:
238
+
239
+ # import os
240
+ # import asyncio
241
+ # from typing import List, Tuple, Callable, Optional
242
+ # import aiosqlite
243
+ # import numpy as np
244
+ # from sentence_transformers import SentenceTransformer
245
+ # from sklearn.metrics.pairwise import cosine_similarity
246
+ # import logging
247
+ # from dotenv import load_dotenv
248
+ #
249
+ # load_dotenv()
250
+ #
251
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
252
+ # logger = logging.getLogger(__name__)
253
+ #
254
+ #
255
+ # class RAGException(Exception):
256
+ # """Custom exception class for RAG-related errors"""
257
+ # pass
258
+ #
259
+ #
260
+ # class BaseRAGSystem:
261
+ # def __init__(self, db_path: str, model_name: Optional[str] = None):
262
+ # """
263
+ # Initialize the RAG system.
264
+ #
265
+ # :param db_path: Path to the SQLite database
266
+ # :param model_name: Name of the SentenceTransformer model to use
267
+ # """
268
+ # self.db_path = db_path
269
+ # self.model_name = model_name or os.getenv('DEFAULT_MODEL_NAME', 'all-MiniLM-L6-v2')
270
+ # try:
271
+ # self.model = SentenceTransformer(self.model_name)
272
+ # logger.info(f"Initialized SentenceTransformer with model: {self.model_name}")
273
+ # except Exception as e:
274
+ # logger.error(f"Failed to initialize SentenceTransformer: {e}")
275
+ # raise RAGException(f"Model initialization failed: {e}")
276
+ #
277
+ # async def init_db(self):
278
+ # try:
279
+ # async with aiosqlite.connect(self.db_path) as db:
280
+ # await db.execute('''
281
+ # CREATE TABLE IF NOT EXISTS documents (
282
+ # id INTEGER PRIMARY KEY,
283
+ # title TEXT,
284
+ # content TEXT,
285
+ # embedding BLOB
286
+ # )
287
+ # ''')
288
+ # await db.commit()
289
+ # logger.info("Initialized database schema")
290
+ # except aiosqlite.Error as e:
291
+ # logger.error(f"Failed to initialize database schema: {e}")
292
+ # raise RAGException(f"Database schema initialization failed: {e}")
293
+ #
294
+ # async def add_documents(self, documents: List[Tuple[str, str]]):
295
+ # try:
296
+ # embeddings = self.model.encode([content for _, content in documents])
297
+ # async with aiosqlite.connect(self.db_path) as db:
298
+ # await db.executemany(
299
+ # 'INSERT INTO documents (title, content, embedding) VALUES (?, ?, ?)',
300
+ # [(title, content, embedding.tobytes()) for (title, content), embedding in
301
+ # zip(documents, embeddings)]
302
+ # )
303
+ # await db.commit()
304
+ # logger.info(f"Added {len(documents)} documents in batch")
305
+ # except Exception as e:
306
+ # logger.error(f"Failed to add documents in batch: {e}")
307
+ # raise RAGException(f"Batch document addition failed: {e}")
308
+ #
309
+ # async def get_documents(self) -> List[Tuple[int, str, str, np.ndarray, str]]:
310
+ # try:
311
+ # async with aiosqlite.connect(self.db_path) as db:
312
+ # async with db.execute('SELECT id, title, content, embedding, source FROM documents') as cursor:
313
+ # documents = [
314
+ # (id, title, content, np.frombuffer(embedding, dtype=np.float32), source)
315
+ # async for id, title, content, embedding, source in cursor
316
+ # ]
317
+ # logger.info(f"Retrieved {len(documents)} documents")
318
+ # return documents
319
+ # except aiosqlite.Error as e:
320
+ # logger.error(f"Failed to retrieve documents: {e}")
321
+ # raise RAGException(f"Document retrieval failed: {e}")
322
+ #
323
+ #
324
+ # class AsyncStandardRAGSystem(BaseRAGSystem):
325
+ # async def get_relevant_documents(self, query: str, top_k: int = 3) -> List[Tuple[int, str, str, float]]:
326
+ # try:
327
+ # query_embedding = self.model.encode([query])[0]
328
+ # documents = await self.get_documents()
329
+ # similarities = [
330
+ # (id, title, content, cosine_similarity([query_embedding], [doc_embedding])[0][0])
331
+ # for id, title, content, doc_embedding in documents
332
+ # ]
333
+ # similarities.sort(key=lambda x: x[3], reverse=True)
334
+ # logger.info(f"Retrieved top {top_k} relevant documents for query")
335
+ # return similarities[:top_k]
336
+ # except Exception as e:
337
+ # logger.error(f"Error in getting relevant documents: {e}")
338
+ # raise RAGException(f"Retrieval of relevant documents failed: {e}")
339
+ #
340
+ # async def rag_query(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> str:
341
+ # try:
342
+ # relevant_docs = await self.get_relevant_documents(query, top_k)
343
+ # context = "\n\n".join([f"Title: {title}\nContent: {content}\nSource: {source}" for _, title, content, _, source in relevant_docs])
344
+ #
345
+ # llm_prompt = f"Based on the following context, please answer the query. Include citations in your response using [Source] format:\n\nContext:\n{context}\n\nQuery: {query}"
346
+ #
347
+ # response = llm_function(llm_prompt)
348
+ # logger.info("Generated response for query")
349
+ # return response
350
+ # except Exception as e:
351
+ # logger.error(f"Error in RAG query: {e}")
352
+ # raise RAGException(f"RAG query failed: {e}")
353
+ #
354
+ #
355
+ # class AsyncHyDERAGSystem(BaseRAGSystem):
356
+ # async def generate_hypothetical_document(self, query: str, llm_function: Callable[[str], str]) -> str:
357
+ # try:
358
+ # prompt = f"Given the question '{query}', write a short paragraph that would answer this question. Do not include the question itself in your response."
359
+ # hypothetical_doc = llm_function(prompt)
360
+ # logger.info("Generated hypothetical document")
361
+ # return hypothetical_doc
362
+ # except Exception as e:
363
+ # logger.error(f"Error generating hypothetical document: {e}")
364
+ # raise RAGException(f"Hypothetical document generation failed: {e}")
365
+ #
366
+ # async def get_relevant_documents(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> List[
367
+ # Tuple[int, str, str, float]]:
368
+ # try:
369
+ # hypothetical_doc = await self.generate_hypothetical_document(query, llm_function)
370
+ # hyde_embedding = self.model.encode([hypothetical_doc])[0]
371
+ #
372
+ # documents = await self.get_documents()
373
+ # similarities = [
374
+ # (id, title, content, cosine_similarity([hyde_embedding], [doc_embedding])[0][0])
375
+ # for id, title, content, doc_embedding in documents
376
+ # ]
377
+ # similarities.sort(key=lambda x: x[3], reverse=True)
378
+ # logger.info(f"Retrieved top {top_k} relevant documents using HyDE")
379
+ # return similarities[:top_k]
380
+ # except Exception as e:
381
+ # logger.error(f"Error in getting relevant documents with HyDE: {e}")
382
+ # raise RAGException(f"HyDE retrieval of relevant documents failed: {e}")
383
+ #
384
+ # async def rag_query(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> str:
385
+ # try:
386
+ # relevant_docs = await self.get_relevant_documents(query, llm_function, top_k)
387
+ # context = "\n\n".join([f"Title: {title}\nContent: {content}" for _, title, content, _ in relevant_docs])
388
+ #
389
+ # llm_prompt = f"Based on the following context, please answer the query:\n\nContext:\n{context}\n\nQuery: {query}"
390
+ #
391
+ # response = llm_function(llm_prompt)
392
+ # logger.info("Generated response for query using HyDE")
393
+ # return response
394
+ # except Exception as e:
395
+ # logger.error(f"Error in HyDE RAG query: {e}")
396
+ # raise RAGException(f"HyDE RAG query failed: {e}")
397
+ #
398
+ #
399
+ # # Example usage with error handling
400
+ # def mock_llm(prompt: str) -> str:
401
+ # if "write a short paragraph" in prompt:
402
+ # return "Paris, the capital of France, is renowned for its iconic Eiffel Tower and rich cultural heritage."
403
+ # else:
404
+ # return f"This is a mock LLM response for the prompt: {prompt}"
405
+ #
406
+ #
407
+ # async def main():
408
+ # use_hyde = False # Set this to True when you want to enable HyDE
409
+ #
410
+ # try:
411
+ # if use_hyde:
412
+ # rag_system = AsyncHyDERAGSystem('rag_database.db')
413
+ # logger.info("Using Async HyDE RAG System")
414
+ # else:
415
+ # rag_system = AsyncStandardRAGSystem('rag_database.db')
416
+ # logger.info("Using Async Standard RAG System")
417
+ #
418
+ # await rag_system.init_db()
419
+ #
420
+ # # Add sample documents
421
+ # sample_docs = [
422
+ # ("Paris", "Paris is the capital of France and is known for the Eiffel Tower."),
423
+ # ("London", "London is the capital of the United Kingdom and home to Big Ben."),
424
+ # ("Tokyo", "Tokyo is the capital of Japan and is famous for its bustling city life.")
425
+ # ]
426
+ #
427
+ # await rag_system.add_documents(sample_docs)
428
+ #
429
+ # query = "What is the capital of France?"
430
+ # result = await rag_system.rag_query(query, mock_llm)
431
+ # print(f"Query: {query}")
432
+ # print(f"Result: {result}")
433
+ #
434
+ # except RAGException as e:
435
+ # logger.error(f"RAG system error: {e}")
436
+ # print(f"An error occurred: {e}")
437
+ # except Exception as e:
438
+ # logger.error(f"Unexpected error: {e}")
439
+ # print(f"An unexpected error occurred: {e}")
440
+ #
441
+ #
442
+ # if __name__ == "__main__":
443
+ # asyncio.run(main())
444
+
445
+
446
+
447
+ #
448
+ # from fastapi import FastAPI, HTTPException
449
+ #
450
+ # app = FastAPI()
451
+ # rag_system = AsyncStandardRAGSystem('rag_database.db')
452
+ #
453
+ # @app.on_event("startup")
454
+ # async def startup_event():
455
+ # await rag_system.init_db()
456
+ #
457
+ # @app.get("/query")
458
+ # async def query(q: str):
459
+ # try:
460
+ # result = await rag_system.rag_query(q, mock_llm)
461
+ # return {"query": q, "result": result}
462
+ # except RAGException as e:
463
+ # raise HTTPException(status_code=500, detail=str(e))
464
+ #
465
+
466
+
467
+ ############################################################################################
468
+ # Using FAISS
469
+ #
470
+ #
471
+ #
472
+ # Update DB
473
+ # async def init_db(self):
474
+ # try:
475
+ # async with aiosqlite.connect(self.db_path) as db:
476
+ # await db.execute('''
477
+ # CREATE TABLE IF NOT EXISTS documents (
478
+ # id INTEGER PRIMARY KEY,
479
+ # title TEXT,
480
+ # content TEXT,
481
+ # embedding BLOB,
482
+ # source TEXT
483
+ # )
484
+ # ''')
485
+ # await db.commit()
486
+ # logger.info("Initialized database schema")
487
+ # except aiosqlite.Error as e:
488
+ # logger.error(f"Failed to initialize database schema: {e}")
489
+ # raise RAGException(f"Database schema initialization failed: {e}")
490
+ #
491
+ #
492
+
493
+ # import os
494
+ # import asyncio
495
+ # from typing import List, Tuple, Callable, Optional
496
+ # import aiosqlite
497
+ # import numpy as np
498
+ # from sentence_transformers import SentenceTransformer
499
+ # import faiss
500
+ # import logging
501
+ # from dotenv import load_dotenv
502
+ #
503
+ # load_dotenv()
504
+ #
505
+ # logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
506
+ # logger = logging.getLogger(__name__)
507
+ #
508
+ #
509
+ # class RAGException(Exception):
510
+ # """Custom exception class for RAG-related errors"""
511
+ # pass
512
+ #
513
+ #
514
+ # class AsyncFAISSRAGSystem:
515
+ # def __init__(self, db_path: str, model_name: Optional[str] = None):
516
+ # self.db_path = db_path
517
+ # self.model_name = model_name or os.getenv('DEFAULT_MODEL_NAME', 'all-MiniLM-L6-v2')
518
+ # try:
519
+ # self.model = SentenceTransformer(self.model_name)
520
+ # logger.info(f"Initialized SentenceTransformer with model: {self.model_name}")
521
+ # except Exception as e:
522
+ # logger.error(f"Failed to initialize SentenceTransformer: {e}")
523
+ # raise RAGException(f"Model initialization failed: {e}")
524
+ #
525
+ # self.index = None
526
+ # self.document_lookup = {}
527
+ #
528
+ # async def init_db(self):
529
+ # try:
530
+ # async with aiosqlite.connect(self.db_path) as db:
531
+ # await db.execute('''
532
+ # CREATE TABLE IF NOT EXISTS documents (
533
+ # id INTEGER PRIMARY KEY,
534
+ # title TEXT,
535
+ # content TEXT
536
+ # )
537
+ # ''')
538
+ # await db.commit()
539
+ # logger.info("Initialized database schema")
540
+ # except aiosqlite.Error as e:
541
+ # logger.error(f"Failed to initialize database schema: {e}")
542
+ # raise RAGException(f"Database schema initialization failed: {e}")
543
+ #
544
+ # async def add_documents(self, documents: List[Tuple[str, str, str]]):
545
+ # try:
546
+ # embeddings = self.model.encode([content for _, content, _ in documents])
547
+ # async with aiosqlite.connect(self.db_path) as db:
548
+ # await db.executemany(
549
+ # 'INSERT INTO documents (title, content, embedding, source) VALUES (?, ?, ?, ?)',
550
+ # [(title, content, embedding.tobytes(), source) for (title, content, source), embedding in
551
+ # zip(documents, embeddings)]
552
+ # )
553
+ # await db.commit()
554
+ # logger.info(f"Added {len(documents)} documents in batch")
555
+ # except Exception as e:
556
+ # logger.error(f"Failed to add documents in batch: {e}")
557
+ # raise RAGException(f"Batch document addition failed: {e}")
558
+ #
559
+ # async def get_relevant_documents(self, query: str, top_k: int = 3) -> List[Tuple[int, str, str, float, str]]:
560
+ # try:
561
+ # query_embedding = self.model.encode([query])[0]
562
+ # documents = await self.get_documents()
563
+ # similarities = [
564
+ # (id, title, content, cosine_similarity([query_embedding], [doc_embedding])[0][0], source)
565
+ # for id, title, content, doc_embedding, source in documents
566
+ # ]
567
+ # similarities.sort(key=lambda x: x[3], reverse=True)
568
+ # logger.info(f"Retrieved top {top_k} relevant documents for query")
569
+ # return similarities[:top_k]
570
+ # except Exception as e:
571
+ # logger.error(f"Error in getting relevant documents: {e}")
572
+ # raise RAGException(f"Retrieval of relevant documents failed: {e}")
573
+ #
574
+ # async def rag_query(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> str:
575
+ # try:
576
+ # relevant_docs = await self.get_relevant_documents(query, top_k)
577
+ # context = "\n\n".join([f"Title: {title}\nContent: {content}" for _, title, content, _ in relevant_docs])
578
+ #
579
+ # llm_prompt = f"Based on the following context, please answer the query:\n\nContext:\n{context}\n\nQuery: {query}"
580
+ #
581
+ # response = llm_function(llm_prompt)
582
+ # logger.info("Generated response for query")
583
+ # return response
584
+ # except Exception as e:
585
+ # logger.error(f"Error in RAG query: {e}")
586
+ # raise RAGException(f"RAG query failed: {e}")
587
+ #
588
+ #
589
+ # class AsyncFAISSHyDERAGSystem(AsyncFAISSRAGSystem):
590
+ # async def generate_hypothetical_document(self, query: str, llm_function: Callable[[str], str]) -> str:
591
+ # try:
592
+ # prompt = f"Given the question '{query}', write a short paragraph that would answer this question. Do not include the question itself in your response."
593
+ # hypothetical_doc = llm_function(prompt)
594
+ # logger.info("Generated hypothetical document")
595
+ # return hypothetical_doc
596
+ # except Exception as e:
597
+ # logger.error(f"Error generating hypothetical document: {e}")
598
+ # raise RAGException(f"Hypothetical document generation failed: {e}")
599
+ #
600
+ # async def get_relevant_documents(self, query: str, llm_function: Callable[[str], str], top_k: int = 3) -> List[
601
+ # Tuple[int, str, str, float]]:
602
+ # try:
603
+ # hypothetical_doc = await self.generate_hypothetical_document(query, llm_function)
604
+ # hyde_embedding = self.model.encode([hypothetical_doc])[0]
605
+ #
606
+ # distances, indices = self.index.search(np.array([hyde_embedding]), top_k)
607
+ #
608
+ # results = []
609
+ # for i, idx in enumerate(indices[0]):
610
+ # doc_id = list(self.document_lookup.keys())[idx]
611
+ # title, content = self.document_lookup[doc_id]
612
+ # results.append((doc_id, title, content, distances[0][i]))
613
+ #
614
+ # logger.info(f"Retrieved top {top_k} relevant documents using HyDE")
615
+ # return results
616
+ # except Exception as e:
617
+ # logger.error(f"Error in getting relevant documents with HyDE: {e}")
618
+ # raise RAGException(f"HyDE retrieval of relevant documents failed: {e}")
619
+ #
620
+ #
621
+ # # Example usage
622
+ # def mock_llm(prompt: str) -> str:
623
+ # if "write a short paragraph" in prompt:
624
+ # return "Paris, the capital of France, is renowned for its iconic Eiffel Tower and rich cultural heritage."
625
+ # else:
626
+ # return f"This is a mock LLM response for the prompt: {prompt}"
627
+ #
628
+ #
629
+ # async def main():
630
+ # use_hyde = False # Set this to True when you want to enable HyDE
631
+ #
632
+ # try:
633
+ # if use_hyde:
634
+ # rag_system = AsyncFAISSHyDERAGSystem('rag_database.db')
635
+ # logger.info("Using Async FAISS HyDE RAG System")
636
+ # else:
637
+ # rag_system = AsyncFAISSRAGSystem('rag_database.db')
638
+ # logger.info("Using Async FAISS RAG System")
639
+ #
640
+ # await rag_system.init_db()
641
+ #
642
+ # # Add sample documents
643
+ # sample_docs = [
644
+ # ("Paris", "Paris is the capital of France and is known for the Eiffel Tower."),
645
+ # ("London", "London is the capital of the United Kingdom and home to Big Ben."),
646
+ # ("Tokyo", "Tokyo is the capital of Japan and is famous for its bustling city life.")
647
+ # ]
648
+ #
649
+ # await rag_system.add_documents(sample_docs)
650
+ #
651
+ # query = "What is the capital of France?"
652
+ # result = await rag_system.rag_query(query, mock_llm)
653
+ # print(f"Query: {query}")
654
+ # print(f"Result: {result}")
655
+ #
656
+ # except RAGException as e:
657
+ # logger.error(f"RAG system error: {e}")
658
+ # print(f"An error occurred: {e}")
659
+ # except Exception as e:
660
+ # logger.error(f"Unexpected error: {e}")
661
+ # print(f"An unexpected error occurred: {e}")
662
+ #
663
+ #
664
+ # if __name__ == "__main__":
665
+ # asyncio.run(main())
666
+
667
+
668
+ """
669
+ Key changes in this FAISS-integrated version:
670
+
671
+ We've replaced the cosine similarity search with FAISS indexing and search.
672
+ The add_documents method now adds embeddings to the FAISS index as well as storing documents in the SQLite database.
673
+ We maintain a document_lookup dictionary to quickly retrieve document content based on FAISS search results.
674
+ The get_relevant_documents method now uses FAISS for similarity search instead of computing cosine similarities manually.
675
+ We've kept the asynchronous structure for database operations, while FAISS operations remain synchronous (as FAISS doesn't have built-in async support).
676
+
677
+ Benefits of using FAISS:
678
+
679
+ Scalability: FAISS can handle millions of vectors efficiently, making it suitable for large document collections.
680
+ Speed: FAISS is optimized for fast similarity search, which can significantly improve query times as your dataset grows.
681
+ Memory Efficiency: FAISS provides various index types that can trade off between search accuracy and memory usage, allowing you to optimize for your specific use case.
682
+
683
+ Considerations:
684
+
685
+ This implementation uses a simple IndexFlatL2 FAISS index, which performs exact search. For larger datasets, you might want to consider approximate search methods like IndexIVFFlat for better scalability.
686
+ The current implementation keeps all document content in memory (in the document_lookup dictionary). For very large datasets, you might want to modify this to fetch document content from the database as needed.
687
+ If you're dealing with a very large number of documents, you might want to implement batch processing for adding documents to the FAISS index.
688
+
689
+ This FAISS-integrated version should provide better performance for similarity search, especially as your document collection grows larger.
690
+ """
691
+
692
+
693
+ ###############################################################################################################
694
+ # Web Search
695
+ # Output from Sonnet 3.5 regarding how to add web searches to the RAG system
696
+ # Integrating web search into your RAG system can significantly enhance its capabilities by providing up-to-date information. Here's how you can modify your RAG system to include web search:
697
+ #
698
+ # First, you'll need to choose a web search API. Some popular options include:
699
+ #
700
+ # Google Custom Search API
701
+ # Bing Web Search API
702
+ # DuckDuckGo API
703
+ # SerpAPI (which can interface with multiple search engines)
704
+ #
705
+ #
706
+ #
707
+ # For this example, let's use the DuckDuckGo API, as it's free and doesn't require authentication.
708
+ #
709
+ # Install the required library:
710
+ # `pip install duckduckgo-search`
711
+ #
712
+ # Add a new method to your RAG system for web search:
713
+ # ```
714
+ # from duckduckgo_search import ddg
715
+ #
716
+ # class AsyncRAGSystem:
717
+ # # ... (existing code) ...
718
+ #
719
+ # async def web_search(self, query: str, num_results: int = 3) -> List[Dict[str, str]]:
720
+ # try:
721
+ # results = ddg(query, max_results=num_results)
722
+ # return [{'title': r['title'], 'content': r['body'], 'source': r['href']} for r in results]
723
+ # except Exception as e:
724
+ # logger.error(f"Error in web search: {e}")
725
+ # raise RAGException(f"Web search failed: {e}")
726
+ #
727
+ # async def add_web_results_to_db(self, results: List[Dict[str, str]]):
728
+ # try:
729
+ # documents = [(r['title'], r['content'], r['source']) for r in results]
730
+ # await self.add_documents(documents)
731
+ # logger.info(f"Added {len(documents)} web search results to the database")
732
+ # except Exception as e:
733
+ # logger.error(f"Error adding web search results to database: {e}")
734
+ # raise RAGException(f"Adding web search results failed: {e}")
735
+ #
736
+ # async def rag_query_with_web_search(self, query: str, llm_function: Callable[[str], str], top_k: int = 3,
737
+ # use_web_search: bool = True, num_web_results: int = 3) -> str:
738
+ # try:
739
+ # if use_web_search:
740
+ # web_results = await self.web_search(query, num_web_results)
741
+ # await self.add_web_results_to_db(web_results)
742
+ #
743
+ # relevant_docs = await self.get_relevant_documents(query, top_k)
744
+ # context = "\n\n".join([f"Title: {title}\nContent: {content}\nSource: {source}"
745
+ # for _, title, content, _, source in relevant_docs])
746
+ #
747
+ # llm_prompt = f"Based on the following context, please answer the query. Include citations in your response using [Source] format:\n\nContext:\n{context}\n\nQuery: {query}"
748
+ #
749
+ # response = llm_function(llm_prompt)
750
+ # logger.info("Generated response for query with web search")
751
+ # return response
752
+ # except Exception as e:
753
+ # logger.error(f"Error in RAG query with web search: {e}")
754
+ # raise RAGException(f"RAG query with web search failed: {e}")
755
+ # ```
756
+ #
757
+ # Update your main function to use the new web search capability:
758
+ # ```
759
+ # async def main():
760
+ # use_hyde = False # Set this to True when you want to enable HyDE
761
+ # use_web_search = True # Set this to False if you don't want to use web search
762
+ #
763
+ # try:
764
+ # if use_hyde:
765
+ # rag_system = AsyncHyDERAGSystem('rag_database.db')
766
+ # logger.info("Using Async HyDE RAG System")
767
+ # else:
768
+ # rag_system = AsyncStandardRAGSystem('rag_database.db')
769
+ # logger.info("Using Async Standard RAG System")
770
+ #
771
+ # await rag_system.init_db()
772
+ #
773
+ # # Add sample documents
774
+ # sample_docs = [
775
+ # ("Paris", "Paris is the capital of France and is known for the Eiffel Tower.", "Local Database"),
776
+ # ("London", "London is the capital of the United Kingdom and home to Big Ben.", "Local Database"),
777
+ # ("Tokyo", "Tokyo is the capital of Japan and is famous for its bustling city life.", "Local Database")
778
+ # ]
779
+ #
780
+ # await rag_system.add_documents(sample_docs)
781
+ #
782
+ # query = "What is the capital of France?"
783
+ # result = await rag_system.rag_query_with_web_search(query, mock_llm, use_web_search=use_web_search)
784
+ # print(f"Query: {query}")
785
+ # print(f"Result: {result}")
786
+ #
787
+ # except RAGException as e:
788
+ # logger.error(f"RAG system error: {e}")
789
+ # print(f"An error occurred: {e}")
790
+ # except Exception as e:
791
+ # logger.error(f"Unexpected error: {e}")
792
+ # print(f"An unexpected error occurred: {e}")
793
+ # ```
794
+ #
795
+ #
796
+ # This implementation does the following:
797
+ #
798
+ # It adds a web_search method that uses the DuckDuckGo API to perform web searches.
799
+ # It adds an add_web_results_to_db method that adds the web search results to your existing database.
800
+ # It modifies the rag_query method (now called rag_query_with_web_search) to optionally perform a web search before retrieving relevant documents.
801
+ #
802
+ # When use_web_search is set to True, the system will:
803
+ #
804
+ # Perform a web search for the given query.
805
+ # Add the web search results to the database.
806
+ # Retrieve relevant documents (which now may include the newly added web search results).
807
+ # Use these documents to generate a response.
808
+ #
809
+ # This approach allows your RAG system to combine information from your existing database with fresh information from the web, potentially providing more up-to-date and comprehensive answers.
810
+ # Remember to handle rate limiting and respect the terms of service of the web search API you choose to use. Also, be aware that adding web search results to your database will increase its size over time, so you may need to implement a strategy to manage this growth (e.g., removing old web search results periodically).
811
+
812
+
App_Function_Libraries/SQLite_DB.py ADDED
@@ -0,0 +1,973 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # SQLite_DB.py
2
+ #########################################
3
+ # SQLite_DB Library
4
+ # This library is used to perform any/all DB operations related to SQLite.
5
+ #
6
+ ####
7
+
8
+ ####################
9
+ # Function List
10
+ # FIXME - UPDATE Function Arguments
11
+ # 1. get_connection(self)
12
+ # 2. execute_query(self, query: str, params: Tuple = ())
13
+ # 3. create_tables()
14
+ # 4. add_keyword(keyword: str)
15
+ # 5. delete_keyword(keyword: str)
16
+ # 6. add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author, ingestion_date)
17
+ # 7. fetch_all_keywords()
18
+ # 8. keywords_browser_interface()
19
+ # 9. display_keywords()
20
+ # 10. export_keywords_to_csv()
21
+ # 11. browse_items(search_query, search_type)
22
+ # 12. fetch_item_details(media_id: int)
23
+ # 13. add_media_version(media_id: int, prompt: str, summary: str)
24
+ # 14. search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10)
25
+ # 15. search_and_display(search_query, search_fields, keywords, page)
26
+ # 16. display_details(index, results)
27
+ # 17. get_details(index, dataframe)
28
+ # 18. format_results(results)
29
+ # 19. export_to_csv(search_query: str, search_fields: List[str], keyword: str, page: int = 1, results_per_file: int = 1000)
30
+ # 20. is_valid_url(url: str) -> bool
31
+ # 21. is_valid_date(date_string: str) -> bool
32
+ # 22. add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model)
33
+ # 23. create_prompts_db()
34
+ # 24. add_prompt(name, details, system, user=None)
35
+ # 25. fetch_prompt_details(name)
36
+ # 26. list_prompts()
37
+ # 27. insert_prompt_to_db(title, description, system_prompt, user_prompt)
38
+ # 28. update_media_content(media_id: int, content: str, prompt: str, summary: str)
39
+ # 29. search_media_database(query: str) -> List[Tuple[int, str, str]]
40
+ # 30. load_media_content(media_id: int)
41
+ # 31.
42
+ # 32.
43
+ #
44
+ #
45
+ #####################
46
+ #
47
+ # Import necessary libraries
48
+ import csv
49
+ import logging
50
+ import os
51
+ import re
52
+ import sqlite3
53
+ import time
54
+ from contextlib import contextmanager
55
+ from datetime import datetime
56
+ from typing import List, Tuple
57
+ # Third-Party Libraries
58
+ import gradio as gr
59
+ import pandas as pd
60
+ # Import Local Libraries
61
+ #
62
+ #######################################################################################################################
63
+ # Function Definitions
64
+ #
65
+
66
+ # Set up logging
67
+ #logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
68
+ #logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
69
+ logger = logging.getLogger(__name__)
70
+
71
+
72
# Custom exceptions
class DatabaseError(Exception):
    """Raised for failures at the SQLite layer: connection, locking, or query errors."""
75
+
76
+
77
class InputError(Exception):
    """Raised when caller-supplied input (URLs, dates, search terms, ...) fails validation."""
79
+
80
+
81
# Database connection function with connection pooling
class Database:
    """Tiny SQLite connection pool with retry-on-lock support.

    Connections are opened lazily with ``check_same_thread=False`` (so they
    may be handed across threads) and recycled through ``self.pool``, a
    LIFO list capped at ``pool_size``.
    """

    def __init__(self, db_name=None):
        # Explicit argument wins; otherwise the DB_NAME env var, then a local default file.
        self.db_name = db_name or os.getenv('DB_NAME', 'media_summary.db')
        self.pool = []
        self.pool_size = 10

    def _acquire(self):
        """Pop a pooled connection or open a new one.

        Retries up to 5 times (1 second apart) while SQLite reports the
        database as locked; raises DatabaseError once retries are exhausted
        or on any other connection failure.
        """
        if self.pool:
            return self.pool.pop()
        retry_count = 5
        retry_delay = 1
        while True:
            try:
                return sqlite3.connect(self.db_name, check_same_thread=False)
            except sqlite3.OperationalError as e:
                if 'database is locked' not in str(e):
                    raise DatabaseError(f"Database error: {e}")
                retry_count -= 1
                if retry_count <= 0:
                    raise DatabaseError("Database is locked and retries have been exhausted")
                logging.warning(f"Database is locked, retrying in {retry_delay} seconds...")
                time.sleep(retry_delay)
            except Exception as e:
                raise DatabaseError(f"Unexpected error: {e}")

    def _release(self, conn):
        """Return a connection to the pool, closing it when the pool is full."""
        if len(self.pool) < self.pool_size:
            self.pool.append(conn)
        else:
            conn.close()

    @contextmanager
    def get_connection(self):
        """Yield a pooled SQLite connection; return it to the pool exactly once.

        Bug fix: the previous implementation appended the connection to the
        pool on the success path *and* again in its ``finally`` clause, so
        the pool filled with duplicate references and the same connection
        could be handed to two callers at once. It also looped back to
        ``yield`` a second time after a lock error propagated through the
        first ``yield``, which ``@contextmanager`` generators cannot do.
        Retrying now happens only around connection acquisition, and the
        release happens only in ``finally``.
        """
        conn = self._acquire()
        try:
            yield conn
        except sqlite3.Error as e:
            # Wrap driver errors that escape the caller's block in the
            # module's domain exception (callers catch DatabaseError).
            raise DatabaseError(f"Database error: {e}")
        finally:
            self._release(conn)

    def execute_query(self, query: str, params: Tuple = ()) -> None:
        """Execute a single statement and commit, wrapping any sqlite3
        failure in DatabaseError together with the offending query text."""
        with self.get_connection() as conn:
            try:
                cursor = conn.cursor()
                cursor.execute(query, params)
                conn.commit()
            except sqlite3.Error as e:
                raise DatabaseError(f"Database error: {e}, Query: {query}")

db = Database()
124
+
125
+
126
# Function to create tables with the new media schema
def create_tables() -> None:
    """Create the full media schema if it does not already exist.

    Runs each DDL statement through the module-level ``db`` pool: the core
    tables (Media, Keywords, MediaKeywords, MediaVersion, MediaModifications),
    two FTS5 virtual tables for full-text search over media and keywords,
    and the supporting B-tree / uniqueness indexes. Every statement uses
    ``IF NOT EXISTS``, so repeated calls are idempotent.

    Raises:
        DatabaseError: propagated from ``db.execute_query`` on any SQLite error.
    """
    table_queries = [
        '''
        CREATE TABLE IF NOT EXISTS Media (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            url TEXT,
            title TEXT NOT NULL,
            type TEXT NOT NULL,
            content TEXT,
            author TEXT,
            ingestion_date TEXT,
            prompt TEXT,
            summary TEXT,
            transcription_model TEXT
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS Keywords (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            keyword TEXT NOT NULL UNIQUE
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS MediaKeywords (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            media_id INTEGER NOT NULL,
            keyword_id INTEGER NOT NULL,
            FOREIGN KEY (media_id) REFERENCES Media(id),
            FOREIGN KEY (keyword_id) REFERENCES Keywords(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS MediaVersion (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            media_id INTEGER NOT NULL,
            version INTEGER NOT NULL,
            prompt TEXT,
            summary TEXT,
            created_at TEXT NOT NULL,
            FOREIGN KEY (media_id) REFERENCES Media(id)
        )
        ''',
        '''
        CREATE TABLE IF NOT EXISTS MediaModifications (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            media_id INTEGER NOT NULL,
            prompt TEXT,
            summary TEXT,
            modification_date TEXT,
            FOREIGN KEY (media_id) REFERENCES Media(id)
        )
        ''',
        # FTS5 virtual tables. NOTE(review): their rowids are presumably kept
        # in sync with Media/Keywords by the insert helpers elsewhere in this
        # file (see add_keyword) -- confirm for Media as well.
        '''
        CREATE VIRTUAL TABLE IF NOT EXISTS media_fts USING fts5(title, content);
        ''',
        '''
        CREATE VIRTUAL TABLE IF NOT EXISTS keyword_fts USING fts5(keyword);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_title ON Media(title);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_type ON Media(type);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_author ON Media(author);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_ingestion_date ON Media(ingestion_date);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_keywords_keyword ON Keywords(keyword);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_mediakeywords_media_id ON MediaKeywords(media_id);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_mediakeywords_keyword_id ON MediaKeywords(keyword_id);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_media_version_media_id ON MediaVersion(media_id);
        ''',
        '''
        CREATE INDEX IF NOT EXISTS idx_mediamodifications_media_id ON MediaModifications(media_id);
        ''',
        # Uniqueness constraints: one Media row per URL, one link row per
        # (media, keyword) pair.
        '''
        CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_url ON Media(url);
        ''',
        '''
        CREATE UNIQUE INDEX IF NOT EXISTS idx_unique_media_keyword ON MediaKeywords(media_id, keyword_id);
        '''
    ]
    # Each statement runs (and commits) on its own pooled connection.
    for query in table_queries:
        db.execute_query(query)

    logging.info("All tables and indexes created successfully.")

# Build the schema at import time so the rest of the module can assume it exists.
create_tables()
225
+
226
+
227
+ #######################################################################################################################
228
+ # Keyword-related Functions
229
+ #
230
+
231
# Function to add a keyword
def add_keyword(keyword: str) -> int:
    """Insert a keyword (normalized to lowercase) if new and return its id.

    Also mirrors the keyword into the keyword_fts full-text index under the
    same rowid. Raises DatabaseError on any SQLite failure.
    """
    keyword = keyword.strip().lower()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        try:
            # INSERT OR IGNORE keeps this idempotent for existing keywords;
            # the follow-up SELECT resolves the id either way.
            cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
            cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
            keyword_id = cursor.fetchone()[0]
            cursor.execute('INSERT OR IGNORE INTO keyword_fts (rowid, keyword) VALUES (?, ?)', (keyword_id, keyword))
            logging.info(f"Keyword '{keyword}' added to keyword_fts with ID: {keyword_id}")
            conn.commit()
            return keyword_id
        except sqlite3.IntegrityError as e:
            logging.error(f"Integrity error adding keyword: {e}")
            raise DatabaseError(f"Integrity error adding keyword: {e}")
        except sqlite3.Error as e:
            logging.error(f"Error adding keyword: {e}")
            raise DatabaseError(f"Error adding keyword: {e}")
250
+
251
+
252
# Function to delete a keyword
def delete_keyword(keyword: str) -> str:
    """Delete a keyword (and its FTS mirror row) by its normalized text.

    Returns a human-readable status message; raises DatabaseError on
    SQLite failure.
    """
    normalized = keyword.strip().lower()
    with db.get_connection() as conn:
        cursor = conn.cursor()
        try:
            cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (normalized,))
            row = cursor.fetchone()
            if not row:
                return f"Keyword '{normalized}' not found."
            cursor.execute('DELETE FROM Keywords WHERE keyword = ?', (normalized,))
            cursor.execute('DELETE FROM keyword_fts WHERE rowid = ?', (row[0],))
            conn.commit()
            return f"Keyword '{normalized}' deleted successfully."
        except sqlite3.Error as e:
            raise DatabaseError(f"Error deleting keyword: {e}")
269
+
270
+
271
+
272
# Function to add media with keywords
def add_media_with_keywords(url, title, media_type, content, keywords, prompt, summary, transcription_model, author,
                            ingestion_date):
    """Insert or update a Media row (matched by URL) plus its keywords,
    modification record, version history, and full-text index entry.

    Missing fields are filled with placeholder defaults. Returns a status
    string. Raises InputError for an invalid media_type or ingestion_date
    and DatabaseError for any database failure.
    """
    # Set default values for missing fields
    url = url or 'Unknown'
    title = title or 'Untitled'
    media_type = media_type or 'Unknown'
    content = content or 'No content available'
    keywords = keywords or 'default'
    prompt = prompt or 'No prompt available'
    summary = summary or 'No summary available'
    transcription_model = transcription_model or 'Unknown'
    author = author or 'Unknown'
    ingestion_date = ingestion_date or datetime.now().strftime('%Y-%m-%d')

    # Ensure URL is valid; fall back to a placeholder so the UNIQUE index
    # on Media.url still has something to key on.
    if not is_valid_url(url):
        url = 'localhost'

    if media_type not in ['article', 'audio', 'document', 'obsidian_note', 'podcast', 'text', 'video', 'unknown']:
        raise InputError("Invalid media type. Allowed types: article, audio file, document, obsidian_note podcast, text, video, unknown.")

    if ingestion_date and not is_valid_date(ingestion_date):
        raise InputError("Invalid ingestion date format. Use YYYY-MM-DD.")

    # Handle keywords as either string or list
    if isinstance(keywords, str):
        keyword_list = [keyword.strip().lower() for keyword in keywords.split(',')]
    elif isinstance(keywords, list):
        keyword_list = [keyword.strip().lower() for keyword in keywords]
    else:
        keyword_list = ['default']

    logging.info(f"Adding/updating media: URL={url}, Title={title}, Type={media_type}")
    logging.debug(f"Content (first 500 chars): {content[:500]}...")
    logging.debug(f"Keywords: {keyword_list}")
    logging.info(f"Prompt: {prompt}")
    logging.info(f"Summary: {summary}")
    logging.info(f"Author: {author}")
    logging.info(f"Ingestion Date: {ingestion_date}")
    logging.info(f"Transcription Model: {transcription_model}")

    # Bug fix: `conn` was referenced in the except blocks even when
    # db.get_connection() itself raised, producing a NameError that masked
    # the original exception. Bind it first and guard the rollback.
    conn = None
    try:
        with db.get_connection() as conn:
            conn.execute("BEGIN TRANSACTION")
            cursor = conn.cursor()

            # Check if media already exists (URL is the upsert key).
            cursor.execute('SELECT id FROM Media WHERE url = ?', (url,))
            existing_media = cursor.fetchone()

            if existing_media:
                media_id = existing_media[0]
                logging.info(f"Updating existing media with ID: {media_id}")

                cursor.execute('''
                UPDATE Media
                SET content = ?, transcription_model = ?, title = ?, type = ?, author = ?, ingestion_date = ?
                WHERE id = ?
                ''', (content, transcription_model, title, media_type, author, ingestion_date, media_id))
            else:
                logging.info("Creating new media entry")

                cursor.execute('''
                INSERT INTO Media (url, title, type, content, author, ingestion_date, transcription_model)
                VALUES (?, ?, ?, ?, ?, ?, ?)
                ''', (url, title, media_type, content, author, ingestion_date, transcription_model))
                media_id = cursor.lastrowid

            logging.info(f"Adding new modification to MediaModifications for media ID: {media_id}")
            cursor.execute('''
            INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
            VALUES (?, ?, ?, ?)
            ''', (media_id, prompt, summary, ingestion_date))
            # Consistency fix: use the logging module like the rest of this
            # function (previously called a bare `logger` name).
            logging.info("New modification added to MediaModifications")

            # Insert keywords and associate with media item
            logging.info("Processing keywords")
            for keyword in keyword_list:
                keyword = keyword.strip().lower()
                cursor.execute('INSERT OR IGNORE INTO Keywords (keyword) VALUES (?)', (keyword,))
                cursor.execute('SELECT id FROM Keywords WHERE keyword = ?', (keyword,))
                keyword_id = cursor.fetchone()[0]
                cursor.execute('INSERT OR IGNORE INTO MediaKeywords (media_id, keyword_id) VALUES (?, ?)',
                               (media_id, keyword_id))

            # Update full-text search index (rowid mirrors Media.id)
            logging.info("Updating full-text search index")
            cursor.execute('INSERT OR REPLACE INTO media_fts (rowid, title, content) VALUES (?, ?, ?)',
                           (media_id, title, content))

            logging.info("Adding new media version")
            add_media_version(media_id, prompt, summary)

            conn.commit()
            logging.info(f"Media '{title}' successfully added/updated with ID: {media_id}")

            return f"Media '{title}' added/updated successfully with keywords: {', '.join(keyword_list)}"

    except sqlite3.Error as e:
        if conn is not None:
            conn.rollback()
        logging.error(f"SQL Error: {e}")
        raise DatabaseError(f"Error adding media with keywords: {e}")
    except Exception as e:
        if conn is not None:
            conn.rollback()
        logging.error(f"Unexpected Error: {e}")
        raise DatabaseError(f"Unexpected error: {e}")
379
+
380
+
381
def fetch_all_keywords() -> List[str]:
    """Return every keyword stored in the Keywords table."""
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute('SELECT keyword FROM Keywords')
            return [row[0] for row in cursor.fetchall()]
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching keywords: {e}")
390
+
391
def keywords_browser_interface():
    """Render all stored keywords as a Gradio Markdown bullet list."""
    bullet_lines = [f"- {kw}" for kw in fetch_all_keywords()]
    return gr.Markdown("\n".join(bullet_lines))
394
+
395
def display_keywords():
    """Return all keywords newline-separated, or a fallback/error message."""
    try:
        stored = fetch_all_keywords()
        if not stored:
            return "No keywords found."
        return "\n".join(stored)
    except DatabaseError as e:
        return str(e)
401
+
402
+
403
def export_keywords_to_csv():
    """Export every keyword to keywords.csv in the working directory.

    Returns (filename, message) on success, or (None, message) when there
    is nothing to export or an error occurred.
    """
    try:
        keywords = fetch_all_keywords()
        if not keywords:
            return None, "No keywords found in the database."

        filename = "keywords.csv"
        with open(filename, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(["Keyword"])
            for keyword in keywords:
                writer.writerow([keyword])

        # Bug fix: the success message previously contained the literal
        # placeholder "(unknown)" instead of the actual file name.
        return filename, f"Keywords exported to {filename}"
    except Exception as e:
        logger.error(f"Error exporting keywords to CSV: {e}")
        return None, f"Error exporting keywords: {e}"
420
+
421
+
422
# Function to fetch items based on search query and type
def browse_items(search_query, search_type):
    """Return (id, title, url) rows matching a substring in title or url.

    search_type must be 'Title' or 'URL'. Bug fix: an unrecognized
    search_type previously fell through with `results` unbound, raising
    UnboundLocalError; it now fails fast with a clear ValueError.
    """
    if search_type not in ('Title', 'URL'):
        raise ValueError(f"Invalid search_type: {search_type}. Expected 'Title' or 'URL'.")
    column = 'title' if search_type == 'Title' else 'url'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            # `column` is whitelisted above, so interpolation is safe here.
            cursor.execute(f"SELECT id, title, url FROM Media WHERE {column} LIKE ?", (f'%{search_query}%',))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise Exception(f"Error fetching items by {search_type}: {e}")
435
+
436
+
437
# Function to fetch item details
def fetch_item_details(media_id: int):
    """Return (content, prompt, summary) for one media item.

    prompt/summary come from the most recent MediaModifications row;
    empty strings stand in for anything missing, and any SQLite error is
    swallowed (logged) and reported as three empty strings.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
                WHERE media_id = ?
                ORDER BY modification_date DESC
                LIMIT 1
            """, (media_id,))
            prompt_summary_result = cursor.fetchone()
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            content_result = cursor.fetchone()

            prompt = prompt_summary_result[0] if prompt_summary_result else ""
            summary = prompt_summary_result[1] if prompt_summary_result else ""
            content = content_result[0] if content_result else ""

            return content, prompt, summary
    except sqlite3.Error as e:
        logging.error(f"Error fetching item details: {e}")
        return "", "", ""  # Return empty strings if there's an error
461
+
462
+ #
463
+ #
464
+ #######################################################################################################################
465
+
466
+
467
+
468
+
469
# Function to add a version of a prompt and summary
def add_media_version(media_id: int, prompt: str, summary: str) -> None:
    """Append the next-numbered row to MediaVersion for this media item.

    Version numbers start at 1 (MAX(version) is NULL -> treated as 0 for a
    new item). Raises DatabaseError on SQLite failure.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()

            # Get the current version number
            cursor.execute('SELECT MAX(version) FROM MediaVersion WHERE media_id = ?', (media_id,))
            current_version = cursor.fetchone()[0] or 0

            # Insert the new version
            cursor.execute('''
            INSERT INTO MediaVersion (media_id, version, prompt, summary, created_at)
            VALUES (?, ?, ?, ?, ?)
            ''', (media_id, current_version + 1, prompt, summary, datetime.now().strftime('%Y-%m-%d %H:%M:%S')))
            conn.commit()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error adding media version: {e}")
487
+
488
+
489
# Function to search the database with advanced options, including keyword search and full-text search
def search_db(search_query: str, search_fields: List[str], keywords: str, page: int = 1, results_per_page: int = 10):
    """Search Media rows with LIKE filters on fields and linked keywords.

    Args:
        search_query: substring matched (LIKE) against each field in
            search_fields.
        search_fields: Media column names to search. NOTE(review): these
            are interpolated directly into the SQL, so they must come from
            a trusted whitelist (the UI), never raw user input.
        keywords: comma-separated keyword filters; each must LIKE-match a
            keyword linked to the media row.
        page: 1-based page number (ValueError if < 1).
        results_per_page: LIMIT for paging.

    Returns:
        Rows of (id, url, title, type, content, author, ingestion_date,
        prompt, summary) — one row per MediaModifications join match.
    """
    if page < 1:
        raise ValueError("Page number must be 1 or greater.")

    # Prepare keywords by splitting and trimming
    keywords = [keyword.strip().lower() for keyword in keywords.split(',') if keyword.strip()]

    with db.get_connection() as conn:
        cursor = conn.cursor()
        offset = (page - 1) * results_per_page

        # Prepare the search conditions for general fields
        search_conditions = []
        params = []

        for field in search_fields:
            if search_query:  # Ensure there's a search query before adding this condition
                search_conditions.append(f"Media.{field} LIKE ?")
                params.append(f'%{search_query}%')

        # Prepare the conditions for keywords filtering: each keyword must
        # independently match via the MediaKeywords link table.
        keyword_conditions = []
        for keyword in keywords:
            keyword_conditions.append(
                f"EXISTS (SELECT 1 FROM MediaKeywords mk JOIN Keywords k ON mk.keyword_id = k.id WHERE mk.media_id = Media.id AND k.keyword LIKE ?)")
            params.append(f'%{keyword}%')

        # Combine all conditions; "1=1" keeps the WHERE clause valid when
        # there are no filters at all.
        where_clause = " AND ".join(
            search_conditions + keyword_conditions) if search_conditions or keyword_conditions else "1=1"

        # Complete the query
        query = f'''
        SELECT DISTINCT Media.id, Media.url, Media.title, Media.type, Media.content, Media.author, Media.ingestion_date,
               MediaModifications.prompt, MediaModifications.summary
        FROM Media
        LEFT JOIN MediaModifications ON Media.id = MediaModifications.media_id
        WHERE {where_clause}
        ORDER BY Media.ingestion_date DESC
        LIMIT ? OFFSET ?
        '''
        params.extend([results_per_page, offset])

        cursor.execute(query, params)
        results = cursor.fetchall()

        return results
537
+
538
+
539
# Gradio function to handle user input and display results with pagination, with better feedback
def search_and_display(search_query, search_fields, keywords, page):
    """Run search_db and normalize its output into a list of row lists."""
    raw_results = search_db(search_query, search_fields, keywords, page)

    if isinstance(raw_results, pd.DataFrame):
        # DataFrame rows become plain lists for the Gradio table component.
        return raw_results.values.tolist()
    if isinstance(raw_results, list):
        # Flatten dict entries into value lists; pass tuples/lists through.
        return [list(entry.values()) if isinstance(entry, dict) else entry
                for entry in raw_results]
    raise TypeError("Unsupported data type for results")
553
+
554
+
555
def display_details(index, results):
    """Render one search-result row as an HTML detail panel.

    `results` may be a pandas DataFrame or a list (of dicts/rows); `index`
    selects the row. Bad input yields an explanatory message string.
    """
    if index is None or results is None:
        return "Please select a result to view details."

    try:
        row_index = int(index)
        if isinstance(results, pd.DataFrame):
            if row_index >= len(results):
                return "Index out of range. Please select a valid index."
            selected_row = results.iloc[row_index]
        else:
            # Non-DataFrame results are assumed indexable (e.g. list of dicts).
            selected_row = results[row_index]
    except ValueError:
        return "Index must be an integer."
    except IndexError:
        return "Index out of range. Please select a valid index."

    # .get works on both pandas Series and dicts, with per-field fallbacks.
    details_html = f"""
    <h3>{selected_row.get('Title', 'No Title')}</h3>
    <p><strong>URL:</strong> {selected_row.get('URL', 'No URL')}</p>
    <p><strong>Type:</strong> {selected_row.get('Type', 'No Type')}</p>
    <p><strong>Author:</strong> {selected_row.get('Author', 'No Author')}</p>
    <p><strong>Ingestion Date:</strong> {selected_row.get('Ingestion Date', 'No Date')}</p>
    <p><strong>Prompt:</strong> {selected_row.get('Prompt', 'No Prompt')}</p>
    <p><strong>Summary:</strong> {selected_row.get('Summary', 'No Summary')}</p>
    <p><strong>Content:</strong> {selected_row.get('Content', 'No Content')}</p>
    """
    return details_html
586
+
587
+
588
def get_details(index, dataframe):
    """Render row `index` of a results DataFrame as an HTML detail view.

    Unlike display_details, this requires the named columns to exist and
    wraps the content in a <pre> block.
    """
    if index is None or dataframe is None or index >= len(dataframe):
        return "Please select a result to view details."
    record = dataframe.iloc[index]
    details = f"""
    <h3>{record['Title']}</h3>
    <p><strong>URL:</strong> {record['URL']}</p>
    <p><strong>Type:</strong> {record['Type']}</p>
    <p><strong>Author:</strong> {record['Author']}</p>
    <p><strong>Ingestion Date:</strong> {record['Ingestion Date']}</p>
    <p><strong>Prompt:</strong> {record['Prompt']}</p>
    <p><strong>Summary:</strong> {record['Summary']}</p>
    <p><strong>Content:</strong></p>
    <pre>{record['Content']}</pre>
    """
    return details
604
+
605
+
606
def format_results(results):
    """Convert raw search_db rows into a DataFrame with fixed columns."""
    columns = ['URL', 'Title', 'Type', 'Content', 'Author', 'Ingestion Date', 'Prompt', 'Summary']
    if not results:
        # Empty frame keeps the column headers for the UI table.
        return pd.DataFrame(columns=columns)

    frame = pd.DataFrame(results, columns=columns)
    logging.debug(f"Formatted DataFrame: {frame}")
    return frame
614
+
615
+
616
# Function to export search results to CSV or markdown with pagination
def export_to_file(search_query: str, search_fields: List[str], keyword: str, page: int = 1, results_per_file: int = 1000, export_format: str = 'csv'):
    """Export one page of search_db results to exports/ as CSV or Markdown.

    Returns a human-readable status message. search_db rows are:
    (id, url, title, type, content, author, ingestion_date, prompt, summary).
    """
    try:
        results = search_db(search_query, search_fields, keyword, page, results_per_file)
        if not results:
            return "No results found to export."

        # Create an 'exports' directory if it doesn't exist
        os.makedirs('exports', exist_ok=True)

        if export_format == 'csv':
            filename = f'exports/search_results_page_{page}.csv'
            with open(filename, 'w', newline='', encoding='utf-8') as file:
                writer = csv.writer(file)
                writer.writerow(['URL', 'Title', 'Type', 'Content', 'Author', 'Ingestion Date', 'Prompt', 'Summary'])
                for row in results:
                    # Bug fix: rows carry a leading id column that the
                    # header does not include; drop it so columns line up.
                    writer.writerow(row[1:])
        elif export_format == 'markdown':
            filename = f'exports/search_results_page_{page}.md'
            with open(filename, 'w', encoding='utf-8') as file:
                for item in results:
                    # Bug fix: indices were off by one (item[0] is the id,
                    # not the url) and item[8] is the summary, not a keyword
                    # list — search_db returns no keyword column at all.
                    markdown_content = convert_to_markdown({
                        'title': item[2],
                        'url': item[1],
                        'type': item[3],
                        'content': item[4],
                        'author': item[5],
                        'ingestion_date': item[6],
                        'summary': item[8],
                        'keywords': []
                    })
                    file.write(markdown_content)
                    file.write("\n---\n\n")  # Separator between items
        else:
            return f"Unsupported export format: {export_format}"

        # Bug fix: message previously contained the literal "(unknown)"
        # instead of the output path.
        return f"Results exported to {filename}"
    except (DatabaseError, InputError) as e:
        return str(e)
656
+
657
+
658
# Helper function to validate URL format
def is_valid_url(url: str) -> bool:
    """Return True when `url` looks like an http/https/ftp/ftps URL.

    Accepts domain names, localhost, IPv4 and bracketed IPv6 hosts, with
    an optional port and path.
    """
    pattern = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    return pattern.match(url) is not None
669
+
670
+
671
# Helper function to validate date format
def is_valid_date(date_string: str) -> bool:
    """Return True when `date_string` parses as a YYYY-MM-DD calendar date."""
    try:
        datetime.strptime(date_string, '%Y-%m-%d')
    except ValueError:
        return False
    return True
678
+
679
+
680
# Add ingested media to DB
def add_media_to_database(url, info_dict, segments, summary, keywords, custom_prompt_input, whisper_model, media_type='video'):
    """Flatten transcription `segments` into text and store the media item.

    Thin wrapper over add_media_with_keywords(); supplies a default
    bulleted-notes prompt when none is given. Title/author come from
    info_dict ('title'/'uploader'). Re-raises any underlying error.
    """
    try:
        # Extract content from segments
        if isinstance(segments, list):
            # NOTE(review): list segments are read via a capitalized 'Text'
            # key while dict input uses lowercase 'text'/'content' — confirm
            # the transcription pipeline really emits 'Text' here.
            content = ' '.join([segment.get('Text', '') for segment in segments if 'Text' in segment])
        elif isinstance(segments, dict):
            content = segments.get('text', '') or segments.get('content', '')
        else:
            content = str(segments)

        logging.debug(f"Extracted content (first 500 chars): {content[:500]}")

        # Set default custom prompt if not provided
        if custom_prompt_input is None:
            custom_prompt_input = """
You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
**Bulleted Note Creation Guidelines**

**Headings**:
- Based on referenced topics, not categories like quotes or terms
- Surrounded by **bold** formatting
- Not listed as bullet points
- No space between headings and list items underneath

**Emphasis**:
- **Important terms** set in bold font
- **Text ending in a colon**: also bolded

**Review**:
- Ensure adherence to specified format
- Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

        logging.info(f"Adding media to database: URL={url}, Title={info_dict.get('title', 'Untitled')}, Type={media_type}")

        result = add_media_with_keywords(
            url=url,
            title=info_dict.get('title', 'Untitled'),
            media_type=media_type,
            content=content,
            keywords=','.join(keywords) if isinstance(keywords, list) else keywords,
            prompt=custom_prompt_input or 'No prompt provided',
            summary=summary or 'No summary provided',
            transcription_model=whisper_model,
            author=info_dict.get('uploader', 'Unknown'),
            ingestion_date=datetime.now().strftime('%Y-%m-%d')
        )

        logging.info(f"Media added successfully: {result}")
        return result

    except Exception as e:
        logging.error(f"Error in add_media_to_database: {str(e)}")
        raise
734
+
735
+
736
+ #
737
+ #
738
+ #######################################################################################################################
739
+
740
+
741
+
742
+
743
+ #######################################################################################################################
744
+ # Functions to manage prompts DB
745
+ #
746
+
747
def create_prompts_db():
    """Create the Prompts table in the separate prompts.db (idempotent)."""
    connection = sqlite3.connect('prompts.db')
    connection.cursor().execute(
        '''
        CREATE TABLE IF NOT EXISTS Prompts (
            id INTEGER PRIMARY KEY AUTOINCREMENT,
            name TEXT NOT NULL UNIQUE,
            details TEXT,
            system TEXT,
            user TEXT
        )
        '''
    )
    connection.commit()
    connection.close()

# Ensure the prompts database exists as soon as this module is imported.
create_prompts_db()
763
+
764
+
765
def add_prompt(name, details, system, user=None):
    """Insert a new named prompt into prompts.db; returns a status message."""
    try:
        connection = sqlite3.connect('prompts.db')
        db_cursor = connection.cursor()
        db_cursor.execute(
            'INSERT INTO Prompts (name, details, system, user) VALUES (?, ?, ?, ?)',
            (name, details, system, user),
        )
        connection.commit()
        connection.close()
        return "Prompt added successfully."
    except sqlite3.IntegrityError:
        # name carries a UNIQUE constraint.
        return "Prompt with this name already exists."
    except sqlite3.Error as e:
        return f"Database error: {e}"
780
+
781
def fetch_prompt_details(name):
    """Return (name, details, system, user) for a stored prompt, or None."""
    connection = sqlite3.connect('prompts.db')
    row = connection.cursor().execute(
        'SELECT name, details, system, user FROM Prompts WHERE name = ?',
        (name,),
    ).fetchone()
    connection.close()
    return row
792
+
793
def list_prompts():
    """Return the names of all prompts stored in prompts.db."""
    connection = sqlite3.connect('prompts.db')
    rows = connection.cursor().execute('SELECT name FROM Prompts').fetchall()
    connection.close()
    return [name for (name,) in rows]
803
+
804
def insert_prompt_to_db(title, description, system_prompt, user_prompt):
    # NOTE(review): this definition is shadowed by a later function of the
    # same name in this module, which writes to prompts.db directly; this
    # thin wrapper around add_prompt() is therefore dead code after import.
    result = add_prompt(title, description, system_prompt, user_prompt)
    return result
807
+
808
+
809
+
810
+
811
+ #
812
+ #
813
+ #######################################################################################################################
814
+
815
+
816
def update_media_content(selected_item, item_mapping, content_input, prompt_input, summary_input):
    """Persist edited content/prompt/summary for the UI-selected media item.

    selected_item: display label chosen in the UI.
    item_mapping: dict mapping display labels to media ids.
    Returns a status string; never raises (errors are reported in the text).
    """
    try:
        if selected_item and item_mapping and selected_item in item_mapping:
            media_id = item_mapping[selected_item]

            with db.get_connection() as conn:
                cursor = conn.cursor()

                # Update the main content in the Media table
                cursor.execute("UPDATE Media SET content = ? WHERE id = ?", (content_input, media_id))

                # Check if a row already exists in MediaModifications for this media_id
                cursor.execute("SELECT COUNT(*) FROM MediaModifications WHERE media_id = ?", (media_id,))
                exists = cursor.fetchone()[0] > 0

                if exists:
                    # Update existing row
                    cursor.execute("""
                        UPDATE MediaModifications
                        SET prompt = ?, summary = ?, modification_date = CURRENT_TIMESTAMP
                        WHERE media_id = ?
                    """, (prompt_input, summary_input, media_id))
                else:
                    # Insert new row
                    cursor.execute("""
                        INSERT INTO MediaModifications (media_id, prompt, summary, modification_date)
                        VALUES (?, ?, ?, CURRENT_TIMESTAMP)
                    """, (media_id, prompt_input, summary_input))

                conn.commit()

            return f"Content updated successfully for media ID: {media_id}"
        else:
            return "No item selected or invalid selection"
    except Exception as e:
        logging.error(f"Error updating media content: {e}")
        return f"Error updating content: {str(e)}"
853
+
854
def search_media_database(query: str) -> List[Tuple[int, str, str]]:
    """Return (id, title, url) for Media rows whose title contains `query`."""
    like_pattern = f'%{query}%'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, title, url FROM Media WHERE title LIKE ?", (like_pattern,))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise Exception(f"Error searching media database: {e}")
863
+
864
def load_media_content(media_id: int) -> dict:
    """Return the Media row's content/prompt/summary as a dict.

    Missing rows yield empty strings for every field.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT content, prompt, summary FROM Media WHERE id = ?", (media_id,))
            row = cursor.fetchone()
        if row is None:
            return {"content": "", "prompt": "", "summary": ""}
        return {"content": row[0], "prompt": row[1], "summary": row[2]}
    except sqlite3.Error as e:
        raise Exception(f"Error loading media content: {e}")
879
+
880
def insert_prompt_to_db(title, description, system_prompt, user_prompt):
    """Insert a prompt row directly into prompts.db; returns a status string.

    NOTE: this redefinition shadows the earlier wrapper of the same name.
    """
    try:
        connection = sqlite3.connect('prompts.db')
        connection.cursor().execute(
            "INSERT INTO Prompts (name, details, system, user) VALUES (?, ?, ?, ?)",
            (title, description, system_prompt, user_prompt)
        )
        connection.commit()
        connection.close()
        return "Prompt added successfully!"
    except sqlite3.Error as e:
        return f"Error adding prompt: {e}"
893
+
894
+
895
def fetch_items_by_title_or_url(search_query: str, search_type: str):
    """Return (id, title, url) rows matching a substring in title or url.

    search_type must be 'Title' or 'URL'. Bug fix: an unrecognized
    search_type previously left `results` unbound and raised
    UnboundLocalError; it now fails fast with a clear ValueError.
    """
    if search_type not in ('Title', 'URL'):
        raise ValueError(f"Invalid search_type: {search_type}. Expected 'Title' or 'URL'.")
    column = 'title' if search_type == 'Title' else 'url'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            # `column` is whitelisted above, so interpolation is safe here.
            cursor.execute(f"SELECT id, title, url FROM Media WHERE {column} LIKE ?", (f'%{search_query}%',))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by {search_type}: {e}")
907
+
908
+
909
def fetch_items_by_keyword(search_query: str):
    """Return (id, title, url) of Media rows tagged with a matching keyword."""
    like_pattern = f'%{search_query}%'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT m.id, m.title, m.url
                FROM Media m
                JOIN MediaKeywords mk ON m.id = mk.media_id
                JOIN Keywords k ON mk.keyword_id = k.id
                WHERE k.keyword LIKE ?
            """, (like_pattern,))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by keyword: {e}")
924
+
925
+
926
def fetch_items_by_content(search_query: str):
    """Return (id, title, url) for Media rows whose content contains the query."""
    like_pattern = f'%{search_query}%'
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("SELECT id, title, url FROM Media WHERE content LIKE ?", (like_pattern,))
            return cursor.fetchall()
    except sqlite3.Error as e:
        raise DatabaseError(f"Error fetching items by content: {e}")
935
+
936
+
937
def fetch_item_details_single(media_id: int):
    """Return (prompt, summary, content) for one media item.

    Same lookups as fetch_item_details but NOTE the different return order
    (prompt/summary first, content last) and that SQLite errors are raised
    here (as Exception) instead of being swallowed.
    """
    try:
        with db.get_connection() as conn:
            cursor = conn.cursor()
            cursor.execute("""
                SELECT prompt, summary
                FROM MediaModifications
                WHERE media_id = ?
                ORDER BY modification_date DESC
                LIMIT 1
            """, (media_id,))
            prompt_summary_result = cursor.fetchone()
            cursor.execute("SELECT content FROM Media WHERE id = ?", (media_id,))
            content_result = cursor.fetchone()

            prompt = prompt_summary_result[0] if prompt_summary_result else ""
            summary = prompt_summary_result[1] if prompt_summary_result else ""
            content = content_result[0] if content_result else ""

            return prompt, summary, content
    except sqlite3.Error as e:
        raise Exception(f"Error fetching item details: {e}")
959
+
960
+
961
+
962
def convert_to_markdown(item):
    """Render a media item dict as a Markdown document.

    Expects keys: title, url, author, ingestion_date, type, keywords
    (iterable of strings), summary, content.
    """
    sections = [
        f"# {item['title']}",
        f"**URL:** {item['url']}",
        f"**Author:** {item['author']}",
        f"**Ingestion Date:** {item['ingestion_date']}",
        f"**Type:** {item['type']}",
        f"**Keywords:** {', '.join(item['keywords'])}",
        "## Summary",
        f"{item['summary']}",
        "## Content",
        f"{item['content']}",
    ]
    # Each section is separated by a blank line; output ends with one too.
    return "\n\n".join(sections) + "\n\n"
App_Function_Libraries/Summarization_General_Lib.py ADDED
@@ -0,0 +1,1388 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Summarization_General_Lib.py
2
+ #########################################
3
+ # General Summarization Library
4
+ # This library is used to perform summarization.
5
+ #
6
+ ####
7
+ import configparser
8
+ ####################
9
+ # Function List
10
+ #
11
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
12
+ # 2. summarize_with_openai(api_key, file_path, custom_prompt_arg)
13
+ # 3. summarize_with_anthropic(api_key, file_path, model, custom_prompt_arg, max_retries=3, retry_delay=5)
14
+ # 4. summarize_with_cohere(api_key, file_path, model, custom_prompt_arg)
15
+ # 5. summarize_with_groq(api_key, file_path, model, custom_prompt_arg)
16
+ #
17
+ #
18
+ ####################
19
+ # Import necessary libraries
20
+ import os
21
+ import logging
22
+ import time
23
+ import requests
24
+ import json
25
+ from requests import RequestException
26
+
27
+ from App_Function_Libraries.Audio_Transcription_Lib import convert_to_wav, speech_to_text
28
+ from App_Function_Libraries.Chunk_Lib import semantic_chunking, rolling_summarize, recursive_summarize_chunks, \
29
+ improved_chunking_process
30
+ from App_Function_Libraries.Diarization_Lib import combine_transcription_and_diarization
31
+ from App_Function_Libraries.Local_Summarization_Lib import summarize_with_llama, summarize_with_kobold, \
32
+ summarize_with_oobabooga, summarize_with_tabbyapi, summarize_with_vllm, summarize_with_local_llm
33
+ from App_Function_Libraries.SQLite_DB import is_valid_url, add_media_to_database
34
+ # Import Local
35
+ from App_Function_Libraries.Utils import load_and_log_configs, load_comprehensive_config, sanitize_filename, \
36
+ clean_youtube_url, extract_video_info, create_download_directory
37
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import download_video
38
+
39
+ #
40
+ #######################################################################################################################
41
+ # Function Definitions
42
+ #
43
+ config = load_comprehensive_config()
44
+ openai_api_key = config.get('API', 'openai_api_key', fallback=None)
45
+
46
def extract_text_from_segments(segments):
    """Concatenate the 'Text' fields of a list of transcript segments.

    Segments without a 'Text' key are skipped with a warning; any
    non-list input is logged and yields an empty string.

    Args:
        segments: Expected to be a list of dicts, each with a 'Text' key.

    Returns:
        The space-joined segment texts, stripped of surrounding whitespace.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    if not isinstance(segments, list):
        logging.warning(f"Unexpected type of 'segments': {type(segments)}")
        return ""

    pieces = []
    for seg in segments:
        logging.debug(f"Current segment: {seg}")
        logging.debug(f"Type of segment: {type(seg)}")
        if 'Text' in seg:
            pieces.append(seg['Text'])
        else:
            logging.warning(f"Skipping segment due to missing 'Text' key: {seg}")

    # Joining with single spaces and stripping matches the original
    # "accumulate + trailing space then strip" behavior.
    return " ".join(pieces).strip()
64
+
65
+
66
def summarize_with_openai(api_key, input_data, custom_prompt_arg):
    """Summarize text with the OpenAI Chat Completions API.

    Args:
        api_key: OpenAI API key; if empty, the key from the config file is used.
        input_data: A JSON string, a path to a JSON file, a dict/list of
            transcript segments, or plain text to summarize.
        custom_prompt_arg: Prompt text appended after the extracted content.

    Returns:
        The summary string on success, otherwise an error-message string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: prefer the explicit parameter, fall back to config.
        if api_key is None or api_key.strip() == "":
            logging.info("OpenAI: API key not provided as parameter")
            logging.info("OpenAI: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['openai']

        if api_key is None or api_key.strip() == "":
            logging.error("OpenAI: API key not found or is empty")
            return "OpenAI: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"OpenAI: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # Input data handling: accept JSON string, JSON file path, or raw data.
        logging.debug(f"OpenAI: Raw input data type: {type(input_data)}")
        logging.debug(f"OpenAI: Raw input data (first 500 chars): {str(input_data)[:500]}...")

        if isinstance(input_data, str):
            if input_data.strip().startswith('{'):
                # It's likely a JSON string
                logging.debug("OpenAI: Parsing provided JSON string data for summarization")
                try:
                    data = json.loads(input_data)
                except json.JSONDecodeError as e:
                    logging.error(f"OpenAI: Error parsing JSON string: {str(e)}")
                    return f"OpenAI: Error parsing JSON input: {str(e)}"
            elif os.path.isfile(input_data):
                logging.debug("OpenAI: Loading JSON data from file for summarization")
                with open(input_data, 'r') as file:
                    data = json.load(file)
            else:
                logging.debug("OpenAI: Using provided string data for summarization")
                data = input_data
        else:
            data = input_data

        logging.debug(f"OpenAI: Processed data type: {type(data)}")
        logging.debug(f"OpenAI: Processed data (first 500 chars): {str(data)[:500]}...")

        # Text extraction: reuse an existing summary if present.
        if isinstance(data, dict):
            if 'summary' in data:
                logging.debug("OpenAI: Summary already exists in the loaded data")
                return data['summary']
            elif 'segments' in data:
                text = extract_text_from_segments(data['segments'])
            else:
                text = json.dumps(data)  # Convert dict to string if no specific format
        elif isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError(f"OpenAI: Invalid input data format: {type(data)}")

        # Fix: compute the model once (was duplicated in the original).
        openai_model = loaded_config_data['models']['openai'] or "gpt-4o"
        logging.debug(f"OpenAI: Extracted text (first 500 chars): {text[:500]}...")
        logging.debug(f"OpenAI: Custom prompt: {custom_prompt_arg}")
        logging.debug(f"OpenAI: Using model: {openai_model}")

        # Fix: authenticate with the validated key (parameter or config
        # fallback); the original used the module-level openai_api_key and
        # ignored a caller-supplied key.
        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug("openai: Preparing data + prompt for submittal")
        openai_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        data = {
            "model": openai_model,
            "messages": [
                {"role": "system", "content": "You are a professional summarizer."},
                {"role": "user", "content": openai_prompt}
            ],
            "max_tokens": 4096,
            "temperature": 0.1
        }

        logging.debug("OpenAI: Posting request")
        response = requests.post('https://api.openai.com/v1/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("OpenAI: Summarization successful")
                logging.debug(f"OpenAI: Summary (first 500 chars): {summary[:500]}...")
                return summary
            else:
                logging.warning("OpenAI: Summary not found in the response data")
                return "OpenAI: Summary not available"
        else:
            logging.error(f"OpenAI: Summarization failed with status code {response.status_code}")
            logging.error(f"OpenAI: Error response: {response.text}")
            return f"OpenAI: Failed to process summary. Status code: {response.status_code}"
    except json.JSONDecodeError as e:
        logging.error(f"OpenAI: Error decoding JSON: {str(e)}", exc_info=True)
        return f"OpenAI: Error decoding JSON input: {str(e)}"
    except requests.RequestException as e:
        logging.error(f"OpenAI: Error making API request: {str(e)}", exc_info=True)
        return f"OpenAI: Error making API request: {str(e)}"
    except Exception as e:
        logging.error(f"OpenAI: Unexpected error: {str(e)}", exc_info=True)
        return f"OpenAI: Unexpected error occurred: {str(e)}"
175
+
176
+
177
def summarize_with_anthropic(api_key, input_data, custom_prompt_arg, max_retries=3, retry_delay=5):
    """Summarize text with the Anthropic Messages API, retrying on 5xx/network errors.

    Args:
        api_key: Anthropic API key; if empty, the key from the config file is used.
        input_data: A path to a JSON file, a dict with a precomputed 'summary',
            a list of transcript segments, or plain text.
        custom_prompt_arg: Prompt text appended after the extracted content.
        max_retries: Number of request attempts before giving up.
        retry_delay: Seconds to sleep between retry attempts.

    Returns:
        The summary string on success, None on API-level failure, or an
        error-message string on exceptions.
    """
    try:
        loaded_config_data = load_and_log_configs()
        # API key validation: prefer the explicit parameter, fall back to config.
        # Fix: the original assigned the fallback to a different variable
        # (anthropic_api_key) and then re-checked api_key, so the config
        # fallback never took effect and the request headers could reference
        # an unbound name.
        if api_key is None or api_key.strip() == "":
            logging.info("Anthropic: API key not provided as parameter")
            logging.info("Anthropic: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['anthropic']

        # Sanity check to ensure API key is not empty in the config file
        if api_key is None or api_key.strip() == "":
            logging.error("Anthropic: API key not found or is empty")
            return "Anthropic: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"Anthropic: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("AnthropicAI: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("AnthropicAI: Using provided string data for summarization")
            data = input_data

        logging.debug(f"AnthropicAI: Loaded data: {data}")
        logging.debug(f"AnthropicAI: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Anthropic: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Anthropic: Invalid input data format")

        # Fix: single model lookup (the original fetched the same config
        # value twice into two variables).
        anthropic_model = loaded_config_data['models']['anthropic']

        headers = {
            'x-api-key': api_key,
            'anthropic-version': '2023-06-01',
            'Content-Type': 'application/json'
        }

        anthropic_prompt = custom_prompt_arg
        logging.debug(f"Anthropic: Prompt is {anthropic_prompt}")
        user_message = {
            "role": "user",
            "content": f"{text} \n\n\n\n{anthropic_prompt}"
        }

        data = {
            "model": anthropic_model,
            "max_tokens": 4096,  # max _possible_ tokens to return
            "messages": [user_message],
            "stop_sequences": ["\n\nHuman:"],
            "temperature": 0.1,
            "top_k": 0,
            "top_p": 1.0,
            "metadata": {
                "user_id": "example_user_id",
            },
            "stream": False,
            "system": "You are a professional summarizer."
        }

        for attempt in range(max_retries):
            try:
                logging.debug("anthropic: Posting request to API")
                response = requests.post('https://api.anthropic.com/v1/messages', headers=headers, json=data)

                # Check if the status code indicates success
                if response.status_code == 200:
                    logging.debug("anthropic: Post submittal successful")
                    response_data = response.json()
                    try:
                        summary = response_data['content'][0]['text'].strip()
                        logging.debug("anthropic: Summarization successful")
                        print("Summary processed successfully.")
                        return summary
                    except (IndexError, KeyError) as e:
                        logging.debug("anthropic: Unexpected data in response")
                        print("Unexpected response format from Anthropic API:", response.text)
                        return None
                elif response.status_code == 500:  # Handle internal server error specifically
                    logging.debug("anthropic: Internal server error")
                    print("Internal server error from API. Retrying may be necessary.")
                    time.sleep(retry_delay)
                else:
                    logging.debug(
                        f"anthropic: Failed to summarize, status code {response.status_code}: {response.text}")
                    print(f"Failed to process summary, status code {response.status_code}: {response.text}")
                    return None

            except RequestException as e:
                logging.error(f"anthropic: Network error during attempt {attempt + 1}/{max_retries}: {str(e)}")
                if attempt < max_retries - 1:
                    time.sleep(retry_delay)
                else:
                    return f"anthropic: Network error: {str(e)}"
    except FileNotFoundError as e:
        logging.error(f"anthropic: File not found: {input_data}")
        return f"anthropic: File not found: {input_data}"
    except json.JSONDecodeError as e:
        logging.error(f"anthropic: Invalid JSON format in file: {input_data}")
        return f"anthropic: Invalid JSON format in file: {input_data}"
    except Exception as e:
        logging.error(f"anthropic: Error in processing: {str(e)}")
        return f"anthropic: Error occurred while processing summary with Anthropic: {str(e)}"
293
+
294
+
295
+ # Summarize with Cohere
296
# Summarize with Cohere
def summarize_with_cohere(api_key, input_data, custom_prompt_arg):
    """Summarize text with the Cohere chat API.

    Args:
        api_key: Cohere API key; if empty, the key from the config file is used.
        input_data: A path to a JSON file, a dict with a precomputed 'summary',
            a list of transcript segments, or plain text.
        custom_prompt_arg: Prompt text appended after the extracted content.

    Returns:
        The summary string on success, otherwise an error-message string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: prefer the explicit parameter, fall back to config.
        # Fix: the original stored the fallback in cohere_api_key but used it
        # in the request headers unconditionally, raising NameError whenever
        # a key was actually supplied as a parameter.
        if api_key is None or api_key.strip() == "":
            logging.info("Cohere: API key not provided as parameter")
            logging.info("Cohere: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['cohere']

        if api_key is None or api_key.strip() == "":
            logging.error("Cohere: API key not found or is empty")
            return "Cohere: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"Cohere: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Cohere: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Cohere: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Cohere: Loaded data: {data}")
        logging.debug(f"Cohere: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Cohere: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        # Fix: the original sent the Anthropic model name to Cohere
        # (loaded_config_data['models']['anthropic']).
        cohere_model = loaded_config_data['models']['cohere']

        headers = {
            'accept': 'application/json',
            'content-type': 'application/json',
            'Authorization': f'Bearer {api_key}'
        }

        cohere_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        logging.debug(f"cohere: Prompt being sent is {cohere_prompt}")

        data = {
            "chat_history": [
                {"role": "USER", "message": cohere_prompt}
            ],
            "message": "Please provide a summary.",
            "model": cohere_model,
            "connectors": [{"id": "web-search"}]
        }

        logging.debug("cohere: Submitting request to API endpoint")
        print("cohere: Submitting request to API endpoint")
        response = requests.post('https://api.cohere.ai/v1/chat', headers=headers, json=data)
        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'text' in response_data:
                summary = response_data['text'].strip()
                logging.debug("cohere: Summarization successful")
                print("Summary processed successfully.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"cohere: API request failed with status code {response.status_code}: {response.text}")
            print(f"Failed to process summary, status code {response.status_code}: {response.text}")
            return f"cohere: API request failed: {response.text}"

    except Exception as e:
        logging.error("cohere: Error in processing: %s", str(e))
        return f"cohere: Error occurred while processing summary with Cohere: {str(e)}"
381
+
382
+
383
+ # https://console.groq.com/docs/quickstart
384
# https://console.groq.com/docs/quickstart
def summarize_with_groq(api_key, input_data, custom_prompt_arg):
    """Summarize text with the Groq OpenAI-compatible chat API.

    Args:
        api_key: Groq API key; if empty, the key from the config file is used.
        input_data: A path to a JSON file, a dict with a precomputed 'summary',
            a list of transcript segments, or plain text.
        custom_prompt_arg: Prompt text appended after the extracted content.

    Returns:
        The summary string on success, otherwise an error-message string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: prefer the explicit parameter, fall back to config.
        if api_key is None or api_key.strip() == "":
            logging.info("Groq: API key not provided as parameter")
            logging.info("Groq: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['groq']

        if api_key is None or api_key.strip() == "":
            logging.error("Groq: API key not found or is empty")
            return "Groq: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"Groq: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # Transcript data handling & Validation
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("Groq: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("Groq: Using provided string data for summarization")
            data = input_data

        logging.debug(f"Groq: Loaded data: {data}")
        logging.debug(f"Groq: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("Groq: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Groq: Invalid input data format")

        # Set the model to be used
        groq_model = loaded_config_data['models']['groq']

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        groq_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        # Fix: original log line lacked the f-prefix, printing the literal
        # "{groq_prompt}" placeholder.
        logging.debug(f"groq: Prompt being sent is {groq_prompt}")

        data = {
            "messages": [
                {
                    "role": "user",
                    "content": groq_prompt
                }
            ],
            "model": groq_model
        }

        logging.debug("groq: Submitting request to API endpoint")
        print("groq: Submitting request to API endpoint")
        response = requests.post('https://api.groq.com/openai/v1/chat/completions', headers=headers, json=data)

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("groq: Summarization successful")
                print("Summarization successful.")
                return summary
            else:
                logging.error("Expected data not found in API response.")
                return "Expected data not found in API response."
        else:
            logging.error(f"groq: API request failed with status code {response.status_code}: {response.text}")
            return f"groq: API request failed: {response.text}"

    except Exception as e:
        logging.error("groq: Error in processing: %s", str(e))
        return f"groq: Error occurred while processing summary with groq: {str(e)}"
469
+
470
+
471
def summarize_with_openrouter(api_key, input_data, custom_prompt_arg):
    """Summarize text with the OpenRouter chat-completions API.

    Args:
        api_key: OpenRouter API key; if empty, the key from the config file is used.
        input_data: A path to a JSON file, a dict with a precomputed 'summary',
            a list of transcript segments, or plain text.
        custom_prompt_arg: Prompt text appended after the extracted content.

    Returns:
        The summary string on success, otherwise an error-message string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: prefer the explicit parameter, fall back to config.
        # Fix: the original declared `global openrouter_model, openrouter_api_key`
        # and referenced both before any assignment, raising NameError whenever a
        # key was supplied as a parameter or the globals were unset.
        if api_key is None or api_key.strip() == "":
            logging.info("OpenRouter: API key not provided as parameter")
            logging.info("OpenRouter: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['openrouter']

        if api_key is None or api_key.strip() == "":
            logging.error("OpenRouter: API key not found or is empty")
            return "OpenRouter: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"OpenRouter: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # Model selection. NOTE(review): the original read the model from the
        # 'api_keys' section under 'openrouter_model'; kept for config
        # compatibility — confirm whether it belongs in the 'models' section
        # like the other providers.
        openrouter_model = loaded_config_data['api_keys'].get('openrouter_model')
        if openrouter_model is None or openrouter_model.strip() == "":
            logging.error("OpenRouter: Model not found in config file")
            return "OpenRouter: Model Not Provided/Found in Config file or is empty"

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("openrouter: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("openrouter: Using provided string data for summarization")
            data = input_data

        logging.debug(f"openrouter: Loaded data: {data}")
        logging.debug(f"openrouter: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("openrouter: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("Invalid input data format")

        # Fix: build the prompt from the extracted text; the original used the
        # raw input_data, sending file paths / unparsed JSON to the model.
        openrouter_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"

        logging.debug("openrouter: Submitting request to API endpoint")
        print("openrouter: Submitting request to API endpoint")
        response = requests.post(
            url="https://openrouter.ai/api/v1/chat/completions",
            headers={
                "Authorization": f"Bearer {api_key}",
            },
            data=json.dumps({
                "model": f"{openrouter_model}",
                "messages": [
                    {"role": "user", "content": openrouter_prompt}
                ]
            })
        )

        response_data = response.json()
        logging.debug("API Response Data: %s", response_data)

        if response.status_code == 200:
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("openrouter: Summarization successful")
                print("openrouter: Summarization successful.")
                return summary
            else:
                logging.error("openrouter: Expected data not found in API response.")
                return "openrouter: Expected data not found in API response."
        else:
            logging.error(f"openrouter: API request failed with status code {response.status_code}: {response.text}")
            return f"openrouter: API request failed: {response.text}"
    except Exception as e:
        logging.error("openrouter: Error in processing: %s", str(e))
        return f"openrouter: Error occurred while processing summary with openrouter: {str(e)}"
572
+
573
def summarize_with_huggingface(api_key, input_data, custom_prompt_arg):
    """Summarize text with the HuggingFace Inference API.

    Note: the request sends only the extracted text as "inputs";
    custom_prompt_arg is accepted for interface parity with the other
    summarizers but is not forwarded to the model by this endpoint.

    Args:
        api_key: HuggingFace API token; if empty, the token from the config
            file is used.
        input_data: A path to a JSON file, a dict with a precomputed 'summary',
            a list of transcript segments, or plain text.
        custom_prompt_arg: Accepted but not sent (see note above).

    Returns:
        The summary string on success, an error-message string on HTTP
        failure, or None on exceptions.
    """
    loaded_config_data = load_and_log_configs()
    logging.debug("huggingface: Summarization process starting...")
    try:
        # API key validation: prefer the explicit parameter, fall back to config.
        if api_key is None or api_key.strip() == "":
            logging.info("HuggingFace: API key not provided as parameter")
            logging.info("HuggingFace: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['huggingface']

        if api_key is None or api_key.strip() == "":
            logging.error("HuggingFace: API key not found or is empty")
            return "HuggingFace: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"HuggingFace: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("HuggingFace: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("HuggingFace: Using provided string data for summarization")
            data = input_data

        logging.debug(f"HuggingFace: Loaded data: {data}")
        logging.debug(f"HuggingFace: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("HuggingFace: Summary already exists in the loaded data")
            return data['summary']

        # If the loaded data is a list of segment dictionaries or a string, proceed with summarization
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("HuggingFace: Invalid input data format")

        headers = {
            "Authorization": f"Bearer {api_key}"
        }

        huggingface_model = loaded_config_data['models']['huggingface']
        API_URL = f"https://api-inference.huggingface.co/models/{huggingface_model}"

        data = {
            "inputs": text,
            "parameters": {"max_length": 512, "min_length": 100}  # You can adjust max_length and min_length as needed
        }

        # Fix: the original printed a possibly-unbound global
        # `huggingface_api_key` here, raising NameError; log the validated key
        # (partially masked) instead.
        logging.debug(f"huggingface: Using API key {api_key[:5]}...{api_key[-5:]}")

        logging.debug("huggingface: Submitting request...")

        response = requests.post(API_URL, headers=headers, json=data)

        if response.status_code == 200:
            summary = response.json()[0]['summary_text']
            logging.debug("huggingface: Summarization successful")
            print("Summarization successful.")
            return summary
        else:
            logging.error(f"huggingface: Summarization failed with status code {response.status_code}: {response.text}")
            return f"Failed to process summary, status code {response.status_code}: {response.text}"
    except Exception as e:
        logging.error("huggingface: Error in processing: %s", str(e))
        print(f"Error occurred while processing summary with huggingface: {str(e)}")
        return None
648
+
649
+
650
def summarize_with_deepseek(api_key, input_data, custom_prompt_arg):
    """Summarize text with the DeepSeek chat-completions API.

    Args:
        api_key: DeepSeek API key; if empty, the key from the config file is used.
        input_data: A path to a JSON file, a dict with a precomputed 'summary',
            a list of transcript segments, or plain text.
        custom_prompt_arg: Prompt text appended after the extracted content.

    Returns:
        The summary string on success, otherwise an error-message string.
    """
    loaded_config_data = load_and_log_configs()
    try:
        # API key validation: prefer the explicit parameter, fall back to config.
        if api_key is None or api_key.strip() == "":
            logging.info("DeepSeek: API key not provided as parameter")
            logging.info("DeepSeek: Attempting to use API key from config file")
            api_key = loaded_config_data['api_keys']['deepseek']

        if api_key is None or api_key.strip() == "":
            logging.error("DeepSeek: API key not found or is empty")
            return "DeepSeek: API Key Not Provided/Found in Config file or is empty"

        logging.debug(f"DeepSeek: Using API Key: {api_key[:5]}...{api_key[-5:]}")

        # Input data handling
        if isinstance(input_data, str) and os.path.isfile(input_data):
            logging.debug("DeepSeek: Loading json data for summarization")
            with open(input_data, 'r') as file:
                data = json.load(file)
        else:
            logging.debug("DeepSeek: Using provided string data for summarization")
            data = input_data

        logging.debug(f"DeepSeek: Loaded data: {data}")
        logging.debug(f"DeepSeek: Type of data: {type(data)}")

        if isinstance(data, dict) and 'summary' in data:
            # If the loaded data is a dictionary and already contains a summary, return it
            logging.debug("DeepSeek: Summary already exists in the loaded data")
            return data['summary']

        # Text extraction
        if isinstance(data, list):
            text = extract_text_from_segments(data)
        elif isinstance(data, str):
            text = data
        else:
            raise ValueError("DeepSeek: Invalid input data format")

        deepseek_model = loaded_config_data['models']['deepseek'] or "deepseek-chat"

        headers = {
            'Authorization': f'Bearer {api_key}',
            'Content-Type': 'application/json'
        }

        logging.debug(
            f"Deepseek API Key: {api_key[:5]}...{api_key[-5:] if api_key else None}")
        # Fix: this log line was mislabeled "openai:" in the original.
        logging.debug("deepseek: Preparing data + prompt for submittal")
        deepseek_prompt = f"{text} \n\n\n\n{custom_prompt_arg}"
        data = {
            "model": deepseek_model,
            "messages": [
                {"role": "system", "content": "You are a professional summarizer."},
                {"role": "user", "content": deepseek_prompt}
            ],
            "stream": False,
            "temperature": 0.8
        }

        logging.debug("DeepSeek: Posting request")
        response = requests.post('https://api.deepseek.com/chat/completions', headers=headers, json=data)

        if response.status_code == 200:
            response_data = response.json()
            if 'choices' in response_data and len(response_data['choices']) > 0:
                summary = response_data['choices'][0]['message']['content'].strip()
                logging.debug("DeepSeek: Summarization successful")
                return summary
            else:
                logging.warning("DeepSeek: Summary not found in the response data")
                return "DeepSeek: Summary not available"
        else:
            logging.error(f"DeepSeek: Summarization failed with status code {response.status_code}")
            logging.error(f"DeepSeek: Error response: {response.text}")
            return f"DeepSeek: Failed to process summary. Status code: {response.status_code}"
    except Exception as e:
        logging.error(f"DeepSeek: Error in processing: {str(e)}", exc_info=True)
        return f"DeepSeek: Error occurred while processing summary: {str(e)}"
731
+
732
+
733
+ #
734
+ #
735
+ #######################################################################################################################
736
+ #
737
+ #
738
+ # Gradio File Processing
739
+
740
+
741
+ # Handle multiple videos as input
742
def process_video_urls(url_list, num_speakers, whisper_model, custom_prompt_input, offset, api_name, api_key, vad_filter,
                       download_video_flag, download_audio, rolling_summarization, detail_level, question_box,
                       keywords, chunk_text_by_words, max_words, chunk_text_by_sentences, max_sentences,
                       chunk_text_by_paragraphs, max_paragraphs, chunk_text_by_tokens, max_tokens, chunk_by_semantic,
                       semantic_chunk_size, semantic_chunk_overlap, recursive_summarization):
    """Process a batch of video URLs sequentially through process_url().

    Each URL is transcribed, summarized, and ingested into the database.
    A failure for one URL is recorded in the status log and does not abort
    the rest of the batch.

    Returns a 6-tuple (progress_text, success_message, None, None, None, None)
    matching the Gradio output signature of process_url().

    Fix: current_progress/current_status are now initialized before the loop,
    so an empty url_list no longer raises NameError at the final return.
    """
    global current_progress
    progress = []  # Running "Processing i/N: url" lines (must always be a list)
    status = []    # Running per-URL status messages (must always be a list)

    # Initialize up front so the final return is well-defined even when
    # url_list is empty (previously: NameError referencing an unset name).
    current_progress = ""
    current_status = ""

    if custom_prompt_input is None:
        # Default "bulleted notes" prompt, shared with process_url().
        custom_prompt_input = """
            You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
    **Bulleted Note Creation Guidelines**

    **Headings**:
    - Based on referenced topics, not categories like quotes or terms
    - Surrounded by **bold** formatting
    - Not listed as bullet points
    - No space between headings and list items underneath

    **Emphasis**:
    - **Important terms** set in bold font
    - **Text ending in a colon**: also bolded

    **Review**:
    - Ensure adherence to specified format
    - Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    def update_progress(index, url, message):
        # Append to the running logs and return display-ready strings.
        progress.append(f"Processing {index + 1}/{len(url_list)}: {url}")  # Append to list
        status.append(message)  # Append to list
        return "\n".join(progress), "\n".join(status)  # Return strings for display

    for index, url in enumerate(url_list):
        try:
            transcription, summary, json_file_path, summary_file_path, _, _ = process_url(
                url=url,
                num_speakers=num_speakers,
                whisper_model=whisper_model,
                custom_prompt_input=custom_prompt_input,
                offset=offset,
                api_name=api_name,
                api_key=api_key,
                vad_filter=vad_filter,
                download_video_flag=download_video_flag,
                download_audio=download_audio,
                rolling_summarization=rolling_summarization,
                detail_level=detail_level,
                question_box=question_box,
                keywords=keywords,
                chunk_text_by_words=chunk_text_by_words,
                max_words=max_words,
                chunk_text_by_sentences=chunk_text_by_sentences,
                max_sentences=max_sentences,
                chunk_text_by_paragraphs=chunk_text_by_paragraphs,
                max_paragraphs=max_paragraphs,
                chunk_text_by_tokens=chunk_text_by_tokens,
                max_tokens=max_tokens,
                chunk_by_semantic=chunk_by_semantic,
                semantic_chunk_size=semantic_chunk_size,
                semantic_chunk_overlap=semantic_chunk_overlap,
                recursive_summarization=recursive_summarization
            )
            # Update progress and transcription properly
            current_progress, current_status = update_progress(index, url, "Video processed and ingested into the database.")
        except Exception as e:
            current_progress, current_status = update_progress(index, url, f"Error: {str(e)}")

    success_message = "All videos have been transcribed, summarized, and ingested into the database successfully."
    return current_progress, success_message, None, None, None, None
813
+
814
+
815
+ # Transcription helpers: generate or reuse cached transcript segments
816
def perform_transcription(video_path, offset, whisper_model, vad_filter, diarize=False):
    """Convert *video_path* to WAV and return (audio_file_path, segments).

    Results are cached next to the WAV file: a ``.segments.json`` (or
    ``.diarized.json`` when *diarize* is True) is loaded instead of
    re-running transcription.  An empty or unparsable cache file is deleted
    and the transcription regenerated.  Returns (None, None) when
    regeneration fails.

    Side effect: sets the module-level global ``segments_json_path``, which
    ``re_generate_transcription`` reads when saving its output.
    """
    global segments_json_path
    audio_file_path = convert_to_wav(video_path, offset)
    segments_json_path = audio_file_path.replace('.wav', '.segments.json')

    if diarize:
        diarized_json_path = audio_file_path.replace('.wav', '.diarized.json')

        # Check if diarized JSON already exists
        if os.path.exists(diarized_json_path):
            logging.info(f"Diarized file already exists: {diarized_json_path}")
            try:
                with open(diarized_json_path, 'r') as file:
                    diarized_segments = json.load(file)
                if not diarized_segments:
                    logging.warning(f"Diarized JSON file is empty, re-generating: {diarized_json_path}")
                    raise ValueError("Empty diarized JSON file")
                logging.debug(f"Loaded diarized segments from {diarized_json_path}")
                return audio_file_path, diarized_segments
            except (json.JSONDecodeError, ValueError) as e:
                # Corrupt or empty cache: delete it and fall through to regeneration.
                logging.error(f"Failed to read or parse the diarized JSON file: {e}")
                os.remove(diarized_json_path)

        # If diarized file doesn't exist or was corrupted, generate new diarized transcription
        logging.info(f"Generating diarized transcription for {audio_file_path}")
        diarized_segments = combine_transcription_and_diarization(audio_file_path)

        # Save diarized segments
        with open(diarized_json_path, 'w') as file:
            json.dump(diarized_segments, file, indent=2)

        return audio_file_path, diarized_segments

    # Non-diarized transcription (existing functionality)
    if os.path.exists(segments_json_path):
        logging.info(f"Segments file already exists: {segments_json_path}")
        try:
            with open(segments_json_path, 'r') as file:
                segments = json.load(file)
            if not segments:
                logging.warning(f"Segments JSON file is empty, re-generating: {segments_json_path}")
                raise ValueError("Empty segments JSON file")
            logging.debug(f"Loaded segments from {segments_json_path}")
        except (json.JSONDecodeError, ValueError) as e:
            # Corrupt or empty cache: delete it and regenerate from the audio.
            logging.error(f"Failed to read or parse the segments JSON file: {e}")
            os.remove(segments_json_path)
            logging.info(f"Re-generating transcription for {audio_file_path}")
            audio_file, segments = re_generate_transcription(audio_file_path, whisper_model, vad_filter)
            if segments is None:
                return None, None
    else:
        audio_file, segments = re_generate_transcription(audio_file_path, whisper_model, vad_filter)

    # NOTE(review): when regeneration fails on this non-cached path,
    # ``segments`` may be None and is still returned — callers check for it.
    return audio_file_path, segments
870
+
871
+
872
def re_generate_transcription(audio_file_path, whisper_model, vad_filter):
    """Run speech-to-text on *audio_file_path* and cache the segments.

    The segments are written to the module-level ``segments_json_path``
    (set by perform_transcription).  Returns (audio_file_path, segments)
    on success and (None, None) on any failure.
    """
    try:
        new_segments = speech_to_text(
            audio_file_path, whisper_model=whisper_model, vad_filter=vad_filter
        )
        # Persist the fresh segments so the next run can reuse them.
        with open(segments_json_path, 'w') as out_file:
            json.dump(new_segments, out_file, indent=2)
    except Exception as e:
        logging.error(f"Error in re-generating transcription: {str(e)}")
        return None, None
    logging.debug(f"Transcription segments saved to {segments_json_path}")
    return audio_file_path, new_segments
883
+
884
+
885
def save_transcription_and_summary(transcription_text, summary_text, download_path, info_dict):
    """Write the transcription (and summary, if any) to text files.

    File names are derived from the sanitized video title in *info_dict*.
    Returns (transcription_file_path, summary_file_path); the summary path
    is None when *summary_text* is falsy, and (None, None) on any error.
    """
    try:
        safe_title = sanitize_filename(info_dict.get('title', 'Untitled'))

        # Transcription is always written.
        transcription_file_path = os.path.join(download_path, f"{safe_title}_transcription.txt")
        with open(transcription_file_path, 'w', encoding='utf-8') as out:
            out.write(transcription_text)

        # Summary is optional.
        if not summary_text:
            return transcription_file_path, None

        summary_file_path = os.path.join(download_path, f"{safe_title}_summary.txt")
        with open(summary_file_path, 'w', encoding='utf-8') as out:
            out.write(summary_text)

        return transcription_file_path, summary_file_path
    except Exception as e:
        logging.error(f"Error in save_transcription_and_summary: {str(e)}", exc_info=True)
        return None, None
905
+
906
+
907
def summarize_chunk(api_name, text, custom_prompt_input, api_key):
    """Dispatch one text chunk to the summarizer selected by *api_name*.

    Returns the summary string from the backend, or None when the API name
    is unsupported or the backend raises.
    """
    # Lazy dispatch table: each entry defers the backend call until invoked,
    # so merely building the table cannot fail.
    backends = {
        'openai': lambda: summarize_with_openai(api_key, text, custom_prompt_input),
        'anthropic': lambda: summarize_with_anthropic(api_key, text, custom_prompt_input),
        'cohere': lambda: summarize_with_cohere(api_key, text, custom_prompt_input),
        'groq': lambda: summarize_with_groq(api_key, text, custom_prompt_input),
        'openrouter': lambda: summarize_with_openrouter(api_key, text, custom_prompt_input),
        'deepseek': lambda: summarize_with_deepseek(api_key, text, custom_prompt_input),
        'llama.cpp': lambda: summarize_with_llama(text, custom_prompt_input),
        'kobold': lambda: summarize_with_kobold(text, api_key, custom_prompt_input),
        'ooba': lambda: summarize_with_oobabooga(text, api_key, custom_prompt_input),
        'tabbyapi': lambda: summarize_with_tabbyapi(text, custom_prompt_input),
        'vllm': lambda: summarize_with_vllm(text, custom_prompt_input),
        'local-llm': lambda: summarize_with_local_llm(text, custom_prompt_input),
        'huggingface': lambda: summarize_with_huggingface(api_key, text, custom_prompt_input),
    }
    try:
        handler = backends.get(api_name.lower())
        if handler is None:
            logging.warning(f"Unsupported API: {api_name}")
            return None
        return handler()
    except Exception as e:
        logging.error(f"Error in summarize_chunk with {api_name}: {str(e)}")
        return None
941
+
942
+
943
def extract_metadata_and_content(input_data):
    """Normalize *input_data* into a (metadata, content) pair.

    Accepts a dict, a JSON string, a path to a JSON file, or arbitrary text.
    Non-JSON strings come back unchanged as content with empty metadata;
    any other type is stringified.
    """
    if isinstance(input_data, dict):
        data = input_data
    elif isinstance(input_data, str):
        if os.path.exists(input_data):
            # The string names a JSON file on disk.
            with open(input_data, 'r', encoding='utf-8') as fh:
                data = json.load(fh)
        else:
            try:
                data = json.loads(input_data)
            except json.JSONDecodeError:
                # Not JSON at all: hand the raw text back as the content.
                return {}, input_data
    else:
        return {}, str(input_data)

    # Metadata with graceful fallbacks.
    metadata = {
        'title': data.get('title', 'No title available'),
        'author': data.get('author', 'Unknown author'),
    }

    # Content: prefer transcription, then segments, then plain content,
    # finally the whole object serialized back to JSON.
    for segment_key in ('transcription', 'segments'):
        if segment_key in data:
            return metadata, extract_text_from_segments(data[segment_key])
    if 'content' in data:
        return metadata, data['content']
    return metadata, json.dumps(data)
976
+
977
def extract_text_from_segments(segments):
    """Flatten a list of segment dicts into one space-joined string.

    Only entries carrying a 'Text' key contribute; any non-list input is
    simply stringified.
    """
    if not isinstance(segments, list):
        return str(segments)
    pieces = (seg.get('Text', '') for seg in segments if 'Text' in seg)
    return ' '.join(pieces)
981
+
982
def format_input_with_metadata(metadata, content):
    """Prefix *content* with a Title/Author header built from *metadata*."""
    header = (
        f"Title: {metadata.get('title', 'No title available')}\n"
        f"Author: {metadata.get('author', 'Unknown author')}\n\n"
    )
    return header + content
987
+
988
def perform_summarization(api_name, input_data, custom_prompt_input, api_key, recursive_summarization=False):
    """Summarize *input_data* with the backend selected by *api_name*.

    The input is normalized via extract_metadata_and_content() and prefixed
    with a Title/Author header.  With recursive_summarization, the text is
    chunked and summarized bottom-up; otherwise it is summarized in one call.
    If *input_data* was a JSON file path, the summary is also written to a
    sibling ``*_summary.txt`` file.

    Returns the summary string, an error string on exception, or None when
    summarization produced nothing (including after a ConnectionError —
    note that the ConnectionError handler deliberately falls through to the
    final ``return None``).
    """
    # NOTE(review): loaded_config_data is never read below — presumably kept
    # for the logging side effect of load_and_log_configs(); confirm.
    loaded_config_data = load_and_log_configs()

    if custom_prompt_input is None:
        # Default "bulleted notes" prompt, shared with process_url().
        custom_prompt_input = """
            You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
    **Bulleted Note Creation Guidelines**

    **Headings**:
    - Based on referenced topics, not categories like quotes or terms
    - Surrounded by **bold** formatting
    - Not listed as bullet points
    - No space between headings and list items underneath

    **Emphasis**:
    - **Important terms** set in bold font
    - **Text ending in a colon**: also bolded

    **Review**:
    - Ensure adherence to specified format
    - Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    try:
        logging.debug(f"Input data type: {type(input_data)}")
        logging.debug(f"Input data (first 500 chars): {str(input_data)[:500]}...")

        # Extract metadata and content
        metadata, content = extract_metadata_and_content(input_data)

        logging.debug(f"Extracted metadata: {metadata}")
        logging.debug(f"Extracted content (first 500 chars): {content[:500]}...")

        # Prepare a structured input for summarization
        structured_input = format_input_with_metadata(metadata, content)

        # Perform summarization on the structured input
        if recursive_summarization:
            chunk_options = {
                'method': 'words',  # or 'sentences', 'paragraphs', 'tokens' based on your preference
                'max_size': 1000,  # adjust as needed
                'overlap': 100,  # adjust as needed
                'adaptive': False,
                'multi_level': False,
                'language': 'english'
            }
            chunks = improved_chunking_process(structured_input, chunk_options)
            # Each chunk is summarized with the selected backend, then the
            # partial summaries are folded together recursively.
            summary = recursive_summarize_chunks([chunk['text'] for chunk in chunks],
                                                 lambda x: summarize_chunk(api_name, x, custom_prompt_input, api_key),
                                                 custom_prompt_input)
        else:
            summary = summarize_chunk(api_name, structured_input, custom_prompt_input, api_key)

        if summary:
            logging.info(f"Summary generated using {api_name} API")
            if isinstance(input_data, str) and os.path.exists(input_data):
                # Input was a file on disk: persist the summary next to it.
                summary_file_path = input_data.replace('.json', '_summary.txt')
                with open(summary_file_path, 'w', encoding='utf-8') as file:
                    file.write(summary)
        else:
            logging.warning(f"Failed to generate summary using {api_name} API")

        return summary

    except requests.exceptions.ConnectionError:
        # NOTE(review): falls through to the final `return None`; the caller
        # cannot distinguish a connection failure from "no summary".
        logging.error("Connection error while summarizing")
    except Exception as e:
        logging.error(f"Error summarizing with {api_name}: {str(e)}", exc_info=True)
        return f"An error occurred during summarization: {str(e)}"
    return None
1057
+
1058
def extract_text_from_input(input_data):
    """Flatten JSON-ish input into a single labeled plain-text blob.

    Strings that parse as JSON (and dicts) have their title, description,
    and transcription/segments fields extracted and joined with blank lines;
    non-JSON strings are returned untouched, other types are stringified.
    """
    if isinstance(input_data, dict):
        data = input_data
    elif isinstance(input_data, str):
        try:
            # Try to parse as JSON
            data = json.loads(input_data)
        except json.JSONDecodeError:
            # Plain text: nothing to unpack.
            return input_data
    else:
        return str(input_data)

    # Collect labeled sections in a fixed order.
    parts = []
    if 'title' in data:
        parts.append(f"Title: {data['title']}")
    if 'description' in data:
        parts.append(f"Description: {data['description']}")

    if 'transcription' in data:
        transcription = data['transcription']
        if isinstance(transcription, list):
            # Joins every entry's 'Text' value; entries without it contribute ''.
            body = ' '.join(segment.get('Text', '') for segment in transcription)
        elif isinstance(transcription, str):
            body = transcription
        else:
            body = str(transcription)
        parts.append(f"Transcription: {body}")
    elif 'segments' in data:
        parts.append(f"Segments: {extract_text_from_segments(data['segments'])}")

    return '\n\n'.join(parts)
1090
+
1091
+
1092
+
1093
def process_url(
        url,
        num_speakers,
        whisper_model,
        custom_prompt_input,
        offset,
        api_name,
        api_key,
        vad_filter,
        download_video_flag,
        download_audio,
        rolling_summarization,
        detail_level,
        # It's for the asking a question about a returned prompt - needs to be removed #FIXME
        question_box,
        keywords,
        chunk_text_by_words,
        max_words,
        chunk_text_by_sentences,
        max_sentences,
        chunk_text_by_paragraphs,
        max_paragraphs,
        chunk_text_by_tokens,
        max_tokens,
        chunk_by_semantic,
        semantic_chunk_size,
        semantic_chunk_overlap,
        local_file_path=None,
        diarize=False,
        recursive_summarization=False
):
    """Top-level Gradio pipeline: download, transcribe, chunk/summarize, ingest.

    Accepts a single URL or a newline-separated batch (batches are delegated
    to process_video_urls).  On success returns the 6-tuple
    (transcription_text, summary_text, json_file_path, summary_file_path, None, None);
    on failure returns error strings in the first two slots.  NOTE: the early
    validation paths return 8-tuples instead — callers must tolerate both.

    Side effects: writes transcription/summary files under the download
    directory, ingests results into the media database, and assigns the
    module-level globals ``info_dict`` and ``segments``.
    """
    # Handle the chunk summarization options
    # NOTE(review): these set_* locals are never read again below — dead code.
    set_chunk_txt_by_words = chunk_text_by_words
    set_max_txt_chunk_words = max_words
    set_chunk_txt_by_sentences = chunk_text_by_sentences
    set_max_txt_chunk_sentences = max_sentences
    set_chunk_txt_by_paragraphs = chunk_text_by_paragraphs
    set_max_txt_chunk_paragraphs = max_paragraphs
    set_chunk_txt_by_tokens = chunk_text_by_tokens
    set_max_txt_chunk_tokens = max_tokens
    set_chunk_txt_by_semantic = chunk_by_semantic
    set_semantic_chunk_size = semantic_chunk_size
    set_semantic_chunk_overlap = semantic_chunk_overlap

    progress = []
    success_message = "All videos processed successfully. Transcriptions and summaries have been ingested into the database."

    if custom_prompt_input is None:
        # Default "bulleted notes" prompt, shared with process_video_urls().
        custom_prompt_input = """
            You are a bulleted notes specialist. ```When creating comprehensive bulleted notes, you should follow these guidelines: Use multiple headings based on the referenced topics, not categories like quotes or terms. Headings should be surrounded by bold formatting and not be listed as bullet points themselves. Leave no space between headings and their corresponding list items underneath. Important terms within the content should be emphasized by setting them in bold font. Any text that ends with a colon should also be bolded. Before submitting your response, review the instructions, and make any corrections necessary to adhered to the specified format. Do not reference these instructions within the notes.``` \nBased on the content between backticks create comprehensive bulleted notes.
    **Bulleted Note Creation Guidelines**

    **Headings**:
    - Based on referenced topics, not categories like quotes or terms
    - Surrounded by **bold** formatting
    - Not listed as bullet points
    - No space between headings and list items underneath

    **Emphasis**:
    - **Important terms** set in bold font
    - **Text ending in a colon**: also bolded

    **Review**:
    - Ensure adherence to specified format
    - Do not reference these instructions in your response.</s>[INST] {{ .Prompt }} [/INST]"""

    # Validate input
    if not url and not local_file_path:
        return "Process_URL: No URL provided.", "No URL provided.", None, None, None, None, None, None

    # FIXME - Chatgpt again?
    if isinstance(url, str):
        urls = url.strip().split('\n')
        if len(urls) > 1:
            # Multiple URLs pasted: hand the whole batch to the multi-URL pipeline.
            return process_video_urls(urls, num_speakers, whisper_model, custom_prompt_input, offset, api_name, api_key, vad_filter,
                                      download_video_flag, download_audio, rolling_summarization, detail_level, question_box,
                                      keywords, chunk_text_by_words, max_words, chunk_text_by_sentences, max_sentences,
                                      chunk_text_by_paragraphs, max_paragraphs, chunk_text_by_tokens, max_tokens, chunk_by_semantic, semantic_chunk_size, semantic_chunk_overlap)
        else:
            urls = [url]

    if url and not is_valid_url(url):
        return "Process_URL: Invalid URL format.", "Invalid URL format.", None, None, None, None, None, None

    if url:
        # Clean the URL to remove playlist parameters if any
        url = clean_youtube_url(url)
        logging.info(f"Process_URL: Processing URL: {url}")

    if api_name:
        print("Process_URL: API Name received:", api_name)  # Debugging line

    video_file_path = None
    global info_dict

    # FIXME - need to handle local audio file processing
    # If Local audio file is provided
    # NOTE(review): this branch is currently a stub — it does nothing and
    # then falls through to the URL path below.
    if local_file_path:
        try:
            pass
            # # insert code to process local audio file
            # # Need to be able to add a title/author/etc for ingestion into the database
            # # Also want to be able to optionally _just_ ingest it, and not ingest.
            # # FIXME
            # #download_path = create_download_directory(title)
            # #audio_path = download_video(url, download_path, info_dict, download_video_flag)
            #
            # audio_file_path = local_file_path
            # global segments
            # audio_file_path, segments = perform_transcription(audio_file_path, offset, whisper_model, vad_filter)
            #
            # if audio_file_path is None or segments is None:
            #     logging.error("Process_URL: Transcription failed or segments not available.")
            #     return "Process_URL: Transcription failed.", "Transcription failed.", None, None, None, None
            #
            # logging.debug(f"Process_URL: Transcription audio_file: {audio_file_path}")
            # logging.debug(f"Process_URL: Transcription segments: {segments}")
            #
            # transcription_text = {'audio_file': audio_file_path, 'transcription': segments}
            # logging.debug(f"Process_URL: Transcription text: {transcription_text}")

            # Rolling Summarization Processing
            # if rolling_summarization:
            #     text = extract_text_from_segments(segments)
            #     summary_text = rolling_summarize_function(
            #         transcription_text,
            #         detail=detail_level,
            #         api_name=api_name,
            #         api_key=api_key,
            #         custom_prompt=custom_prompt,
            #         chunk_by_words=chunk_text_by_words,
            #         max_words=max_words,
            #         chunk_by_sentences=chunk_text_by_sentences,
            #         max_sentences=max_sentences,
            #         chunk_by_paragraphs=chunk_text_by_paragraphs,
            #         max_paragraphs=max_paragraphs,
            #         chunk_by_tokens=chunk_text_by_tokens,
            #         max_tokens=max_tokens
            #     )
            # if api_name:
            #     summary_text = perform_summarization(api_name, segments_json_path, custom_prompt, api_key, config)
            #     if summary_text is None:
            #         logging.error("Summary text is None. Check summarization function.")
            #         summary_file_path = None  # Set summary_file_path to None if summary is not generated
            # else:
            #     summary_text = 'Summary not available'
            #     summary_file_path = None  # Set summary_file_path to None if summary is not generated
            #
            # json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text, download_path)
            #
            # add_media_to_database(url, info_dict, segments, summary_text, keywords, custom_prompt, whisper_model)
            #
            # return transcription_text, summary_text, json_file_path, summary_file_path, None, None

        except Exception as e:
            logging.error(f": {e}")
            return str(e), 'process_url: Error processing the request.', None, None, None, None

    # If URL/Local video file is provided
    try:
        info_dict, title = extract_video_info(url)
        download_path = create_download_directory(title)
        video_path = download_video(url, download_path, info_dict, download_video_flag)
        global segments
        audio_file_path, segments = perform_transcription(video_path, offset, whisper_model, vad_filter)

        if diarize:
            transcription_text = combine_transcription_and_diarization(audio_file_path)
        else:
            # NOTE(review): transcription runs a *second* time here even
            # though perform_transcription was already called just above —
            # the cache makes this cheap but it looks unintentional; confirm.
            audio_file, segments = perform_transcription(video_path, offset, whisper_model, vad_filter)
            transcription_text = {'audio_file': audio_file, 'transcription': segments}

        if audio_file_path is None or segments is None:
            logging.error("Process_URL: Transcription failed or segments not available.")
            return "Process_URL: Transcription failed.", "Transcription failed.", None, None, None, None

        logging.debug(f"Process_URL: Transcription audio_file: {audio_file_path}")
        logging.debug(f"Process_URL: Transcription segments: {segments}")

        logging.debug(f"Process_URL: Transcription text: {transcription_text}")

        # FIXME - Implement chunking calls here
        # Implement chunking calls here
        # FIXME(review): chunk_text_by_words/.._sentences/.._paragraphs/.._tokens
        # are this function's *boolean* parameters, which shadow the chunking
        # helper functions of the same names — calling them below raises
        # TypeError whenever the flag is truthy.  Needs a parameter rename.
        chunked_transcriptions = []
        if chunk_text_by_words:
            chunked_transcriptions = chunk_text_by_words(transcription_text['transcription'], max_words)
        elif chunk_text_by_sentences:
            chunked_transcriptions = chunk_text_by_sentences(transcription_text['transcription'], max_sentences)
        elif chunk_text_by_paragraphs:
            chunked_transcriptions = chunk_text_by_paragraphs(transcription_text['transcription'], max_paragraphs)
        elif chunk_text_by_tokens:
            chunked_transcriptions = chunk_text_by_tokens(transcription_text['transcription'], max_tokens)
        elif chunk_by_semantic:
            chunked_transcriptions = semantic_chunking(transcription_text['transcription'], semantic_chunk_size, 'tokens')

        # If we did chunking, we now have the chunked transcripts in 'chunked_transcriptions'
        elif rolling_summarization:
            # FIXME - rolling summarization
            # text = extract_text_from_segments(segments)
            # summary_text = rolling_summarize_function(
            #     transcription_text,
            #     detail=detail_level,
            #     api_name=api_name,
            #     api_key=api_key,
            #     custom_prompt_input=custom_prompt_input,
            #     chunk_by_words=chunk_text_by_words,
            #     max_words=max_words,
            #     chunk_by_sentences=chunk_text_by_sentences,
            #     max_sentences=max_sentences,
            #     chunk_by_paragraphs=chunk_text_by_paragraphs,
            #     max_paragraphs=max_paragraphs,
            #     chunk_by_tokens=chunk_text_by_tokens,
            #     max_tokens=max_tokens
            # )
            pass
        else:
            pass

        summarized_chunk_transcriptions = []

        # NOTE(review): `and` binds tighter than `or`, so only the
        # chunk_by_semantic flag is actually guarded by api_name here;
        # the other four flags trigger this branch even with no API set.
        if chunk_text_by_words or chunk_text_by_sentences or chunk_text_by_paragraphs or chunk_text_by_tokens or chunk_by_semantic and api_name:
            # Perform summarization based on chunks
            for chunk in chunked_transcriptions:
                # NOTE(review): summarized_chunks is re-created each iteration
                # and never used — dead code.
                summarized_chunks = []
                if api_name == "anthropic":
                    summary = summarize_with_anthropic(api_key, chunk, custom_prompt_input)
                elif api_name == "cohere":
                    summary = summarize_with_cohere(api_key, chunk, custom_prompt_input)
                elif api_name == "openai":
                    summary = summarize_with_openai(api_key, chunk, custom_prompt_input)
                elif api_name == "Groq":
                    summary = summarize_with_groq(api_key, chunk, custom_prompt_input)
                elif api_name == "DeepSeek":
                    summary = summarize_with_deepseek(api_key, chunk, custom_prompt_input)
                elif api_name == "OpenRouter":
                    summary = summarize_with_openrouter(api_key, chunk, custom_prompt_input)
                elif api_name == "Llama.cpp":
                    summary = summarize_with_llama(chunk, custom_prompt_input)
                elif api_name == "Kobold":
                    # NOTE(review): summarize_chunk() passes api_key to the
                    # kobold/ooba backends but this call does not — confirm
                    # which signature is correct.
                    summary = summarize_with_kobold(chunk, custom_prompt_input)
                elif api_name == "Ooba":
                    summary = summarize_with_oobabooga(chunk, custom_prompt_input)
                elif api_name == "Tabbyapi":
                    summary = summarize_with_tabbyapi(chunk, custom_prompt_input)
                elif api_name == "VLLM":
                    summary = summarize_with_vllm(chunk, custom_prompt_input)
                summarized_chunk_transcriptions.append(summary)

            # Combine chunked transcriptions into a single file
            combined_transcription_text = '\n\n'.join(chunked_transcriptions)
            combined_transcription_file_path = os.path.join(download_path, 'combined_transcription.txt')
            with open(combined_transcription_file_path, 'w') as f:
                f.write(combined_transcription_text)

            # Combine summarized chunk transcriptions into a single file
            combined_summary_text = '\n\n'.join(summarized_chunk_transcriptions)
            combined_summary_file_path = os.path.join(download_path, 'combined_summary.txt')
            with open(combined_summary_file_path, 'w') as f:
                f.write(combined_summary_text)

        # Handle rolling summarization
        if rolling_summarization:
            summary_text = rolling_summarize(
                text=extract_text_from_segments(segments),
                detail=detail_level,
                model='gpt-4-turbo',
                additional_instructions=custom_prompt_input,
                summarize_recursively=recursive_summarization
            )
        elif api_name:
            summary_text = perform_summarization(api_name, segments_json_path, custom_prompt_input, api_key,
                                                 recursive_summarization)
        else:
            summary_text = 'Summary not available'

        # Check to see if chunking was performed, and if so, return that instead
        if chunk_text_by_words or chunk_text_by_sentences or chunk_text_by_paragraphs or chunk_text_by_tokens or chunk_by_semantic:
            # Combine chunked transcriptions into a single file
            # FIXME - validate this works....
            # FIXME(review): save_transcription_and_summary takes 4 args
            # (incl. info_dict) and expects *text*, but this call passes 3
            # args with file *paths* — raises TypeError as written.
            json_file_path, summary_file_path = save_transcription_and_summary(combined_transcription_file_path, combined_summary_file_path, download_path)
            add_media_to_database(url, info_dict, segments, summary_text, keywords, custom_prompt_input, whisper_model)
            return transcription_text, summary_text, json_file_path, summary_file_path, None, None
        else:
            # FIXME(review): same 3-vs-4 argument mismatch as above.
            json_file_path, summary_file_path = save_transcription_and_summary(transcription_text, summary_text, download_path)
            add_media_to_database(url, info_dict, segments, summary_text, keywords, custom_prompt_input, whisper_model)
            return transcription_text, summary_text, json_file_path, summary_file_path, None, None

    except Exception as e:
        logging.error(f": {e}")
        return str(e), 'process_url: Error processing the request.', None, None, None, None
1385
+
1386
+ #
1387
+ #
1388
+ ############################################################################################################################################
App_Function_Libraries/System_Checks_Lib.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # System_Checks_Lib.py
2
+ #########################################
3
+ # System Checks Library
4
+ # This library is used to check the system for the necessary dependencies to run the script.
5
+ # It checks for the OS, the availability of the GPU, and the availability of the ffmpeg executable.
6
+ # If the GPU is available, it asks the user if they would like to use it for processing.
7
+ # If ffmpeg is not found, it asks the user if they would like to download it.
8
+ # The script will exit if the user chooses not to download ffmpeg.
9
+ ####
10
+
11
+ ####################
12
+ # Function List
13
+ #
14
+ # 1. platform_check()
15
+ # 2. cuda_check()
16
+ # 3. decide_cpugpu()
17
+ # 4. check_ffmpeg()
18
+ # 5. download_ffmpeg()
19
+ #
20
+ ####################
21
+
22
+
23
+
24
+
25
+ # Import necessary libraries
26
+ import logging
27
+ import os
28
+ import platform
29
+ import requests
30
+ import shutil
31
+ import subprocess
32
+ import zipfile
33
+ # Import Local Libraries
34
+ #from App_Function_Libraries import
35
+ #
36
+ #######################################################################################################################
37
+ # Function Definitions
38
+ #
39
+
40
def platform_check():
    """Detect the host OS and record it in the module-global ``userOS``.

    Sets ``userOS`` to "Linux", "Windows", or "Darwin" (macOS) and returns it;
    exits the process for any other platform.

    Returns:
        str: The detected OS name.
    """
    global userOS
    system = platform.system()
    if system == "Linux":
        print("Linux OS detected \n Running Linux appropriate commands")
        userOS = "Linux"
    elif system == "Windows":
        print("Windows OS detected \n Running Windows appropriate commands")
        userOS = "Windows"
    elif system == "Darwin":
        # Bug fix: macOS previously fell into the "other OS" branch and the
        # script exited, even though the downloader code elsewhere in this
        # project explicitly supports 'darwin' (ffmpeg path selection).
        print("MacOS detected \n Running MacOS appropriate commands")
        userOS = "Darwin"
    else:
        print("Other OS detected \n Maybe try running things manually?")
        exit()
    return userOS
51
+
52
+
53
+ # Check for NVIDIA GPU and CUDA availability
54
def cuda_check():
    """Probe for an NVIDIA GPU via ``nvidia-smi`` and set ``processing_choice``.

    Sets the module-global ``processing_choice`` to "cuda" when nvidia-smi
    runs and reports a CUDA version, and to "cpu" otherwise (binary missing,
    non-zero exit, or any unexpected failure). Returns the chosen value.

    Returns:
        str: "cuda" or "cpu".
    """
    global processing_choice
    try:
        # List-form argv avoids an unnecessary shell (shell=True was not needed
        # and is a mild injection/portability hazard).
        nvidia_smi_output = subprocess.check_output(["nvidia-smi"]).decode()

        # nvidia-smi prints "CUDA Version: X.Y" in its banner when a driver
        # with CUDA support is present.
        if "CUDA Version" in nvidia_smi_output:
            cuda_version = next(
                (line.split(":")[-1].strip() for line in nvidia_smi_output.splitlines() if "CUDA Version" in line),
                "Not found")
            print(f"NVIDIA GPU with CUDA Version {cuda_version} is available.")
            processing_choice = "cuda"
        else:
            print("CUDA is not installed or configured correctly.")
            processing_choice = "cpu"

    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        # FileNotFoundError: nvidia-smi is not installed at all. Without a
        # shell, a missing binary raises this instead of CalledProcessError,
        # so it must be handled explicitly.
        print(f"Failed to run 'nvidia-smi': {str(e)}")
        processing_choice = "cpu"
    except Exception as e:
        print(f"An error occurred: {str(e)}")
        processing_choice = "cpu"

    # Informational only: report whether CUDA_VISIBLE_DEVICES restricts GPUs.
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        print("CUDA_VISIBLE_DEVICES is set:", os.environ["CUDA_VISIBLE_DEVICES"])
    else:
        print("CUDA_VISIBLE_DEVICES not set.")
    return processing_choice
83
+
84
+
85
+ # Ask user if they would like to use either their GPU or their CPU for transcription
86
def decide_cpugpu():
    """Interactively ask whether transcription should run on GPU or CPU.

    Reads one line from stdin: "1"/"cuda" selects the GPU (honored only when
    cuda_check() previously set ``processing_choice`` to "cuda"); "2"/"cpu"
    selects the CPU. Any other answer prints an error and leaves the previous
    choice untouched.
    """
    global processing_choice
    # Bug fix: if cuda_check() was never called, the global does not exist and
    # the comparison below raised NameError. Default to CPU in that case.
    if "processing_choice" not in globals():
        processing_choice = "cpu"
    processing_input = input("Would you like to use your GPU or CPU for transcription? (1/cuda)GPU/(2/cpu)CPU): ")
    if processing_choice == "cuda" and (processing_input.lower() == "cuda" or processing_input == "1"):
        print("You've chosen to use the GPU.")
        logging.debug("GPU is being used for processing")
        processing_choice = "cuda"
    elif processing_input.lower() == "cpu" or processing_input == "2":
        print("You've chosen to use the CPU.")
        logging.debug("CPU is being used for processing")
        processing_choice = "cpu"
    else:
        print("Invalid choice. Please select either GPU or CPU.")
99
+
100
+
101
+ # check for existence of ffmpeg
102
def check_ffmpeg():
    """Verify ffmpeg is reachable; offer to fetch it (Windows) when it is not.

    Accepts ffmpeg either on PATH or as ``./Bin/ffmpeg.exe``. On Windows the
    user is offered an automatic download; on Linux the user is pointed at
    their package manager; on any other OS the user may choose to exit.
    """
    if shutil.which("ffmpeg") or (os.path.exists("Bin") and os.path.isfile(".\\Bin\\ffmpeg.exe")):
        logging.debug("ffmpeg found installed on the local system, in the local PATH, or in the './Bin' folder")
    else:
        logging.debug("ffmpeg not installed on the local system/in local PATH")
        print(
            "ffmpeg is not installed.\n\n You can either install it manually, or through your package manager of "
            "choice.\n Windows users, builds are here: https://www.gyan.dev/ffmpeg/builds/")
        if userOS == "Windows":
            download_ffmpeg()
        elif userOS == "Linux":
            print(
                "You should install ffmpeg using your platform's appropriate package manager, 'apt install ffmpeg',"
                "'dnf install ffmpeg' or 'pacman', etc.")
        else:
            logging.debug("running an unsupported OS")
            print("You're running an unspported/Un-tested OS")
            exit_script = input("Let's exit the script, unless you're feeling lucky? (y/n)")
            # Bug fix: the original condition `exit_script == "y" or "yes" or "1"`
            # was always truthy ("yes" is a non-empty string), so *every* answer
            # -- including "n" -- exited the script.
            if exit_script.strip().lower() in ("y", "yes", "1"):
                exit()
123
+
124
+
125
+ # Download ffmpeg
126
def download_ffmpeg():
    """Interactively download a Windows ffmpeg build into ``./Bin``.

    Prompts the user; on consent fetches the gyan.dev "release essentials"
    zip, extracts just ``ffmpeg.exe`` into the ``Bin`` folder, then deletes
    the zip. Declining, or a failed download, leaves the system untouched.
    """
    user_choice = input("Do you want to download ffmpeg? (y)Yes/(n)No: ")
    if user_choice.lower() in ['yes', 'y', '1']:
        print("Downloading ffmpeg")
        url = "https://www.gyan.dev/ffmpeg/builds/ffmpeg-release-essentials.zip"
        # Improvement: stream the (large) archive to disk instead of holding
        # the whole body in memory, and bound the wait so a dead mirror
        # cannot hang the script forever.
        response = requests.get(url, stream=True, timeout=60)

        if response.status_code == 200:
            print("Saving ffmpeg zip file")
            logging.debug("Saving ffmpeg zip file")
            zip_path = "ffmpeg-release-essentials.zip"
            with open(zip_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        file.write(chunk)

            logging.debug("Extracting the 'ffmpeg.exe' file from the zip")
            print("Extracting ffmpeg.exe from zip file to '/Bin' folder")
            with zipfile.ZipFile(zip_path, 'r') as zip_ref:
                # Locate ffmpeg.exe inside the archive (it sits in a
                # versioned subdirectory, so the path is not fixed).
                ffmpeg_path = None
                for file_info in zip_ref.infolist():
                    if file_info.filename.endswith("ffmpeg.exe"):
                        ffmpeg_path = file_info.filename
                        break

                if ffmpeg_path is None:
                    logging.error("ffmpeg.exe not found in the zip file.")
                    print("ffmpeg.exe not found in the zip file.")
                    return

                logging.debug("checking if the './Bin' folder exists, creating if not")
                bin_folder = "Bin"
                if not os.path.exists(bin_folder):
                    logging.debug("Creating a folder for './Bin', it didn't previously exist")
                    os.makedirs(bin_folder)

                logging.debug("Extracting 'ffmpeg.exe' to the './Bin' folder")
                zip_ref.extract(ffmpeg_path, path=bin_folder)

                # The extract above recreates the archive's subdirectory, so
                # move the binary up to Bin/ffmpeg.exe where the rest of the
                # code expects it.
                logging.debug("Moving 'ffmpeg.exe' to the './Bin' folder")
                src_path = os.path.join(bin_folder, ffmpeg_path)
                dst_path = os.path.join(bin_folder, "ffmpeg.exe")
                shutil.move(src_path, dst_path)

            logging.debug("Removing ffmpeg zip file")
            print("Deleting zip file (we've already extracted ffmpeg.exe, no worries)")
            os.remove(zip_path)

            logging.debug("ffmpeg.exe has been downloaded and extracted to the './Bin' folder.")
            print("ffmpeg.exe has been successfully downloaded and extracted to the './Bin' folder.")
        else:
            logging.error("Failed to download the zip file.")
            print("Failed to download the zip file.")
    else:
        logging.debug("User chose to not download ffmpeg")
        print("ffmpeg will not be downloaded.")
181
+
182
+ #
183
+ #
184
+ #######################################################################################################################
App_Function_Libraries/Tokenization_Methods_Lib.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Tokenization_Methods_Lib.py
2
+ #########################################
3
+ # Tokenization Methods Library
4
+ # This library is used to handle tokenization of text for summarization.
5
+ #
6
+ ####
7
+ import tiktoken
8
+
9
+ # Import Local
10
+ from typing import List
11
+
12
+ ####################
13
+ # Function List
14
+ #
15
+ # 1. openai_tokenize(text: str) -> List[str]
16
+ #
17
+ ####################
18
+
19
+
20
+ #######################################################################################################################
21
+ # Function Definitions
22
+ #
23
+
24
def openai_tokenize(text: str) -> List[int]:
    """Tokenize *text* with the tiktoken encoding used by gpt-4-turbo.

    Note: ``Encoding.encode`` returns token *ids* (integers), not strings,
    so the return type is ``List[int]`` (the previous ``List[str]``
    annotation was incorrect).
    """
    encoding = tiktoken.encoding_for_model('gpt-4-turbo')
    return encoding.encode(text)
27
+
28
+ #
29
+ #
30
+ #######################################################################################################################
App_Function_Libraries/Tone-Changer.py ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import json
3
+ from transformers import pipeline
4
+
5
+ # Initialize the text generation pipeline
6
+ generator = pipeline('text-generation', model='gpt2')
7
+
8
+
9
def adjust_tone(text, concise, casual):
    """Rewrite *text* via the module-level GPT-2 pipeline, steered by tone weights.

    The two sliders yield four weighted tones (concise/expanded and
    casual/professional as complementary pairs); the two heaviest tones are
    folded into the prompt handed to the generator.
    """
    candidate_tones = [
        {"tone": "concise", "weight": concise},
        {"tone": "casual", "weight": casual},
        {"tone": "professional", "weight": 1 - casual},
        {"tone": "expanded", "weight": 1 - concise},
    ]
    # Keep only the two strongest tones so the prompt stays focused.
    top_two = sorted(candidate_tones, key=lambda entry: entry['weight'], reverse=True)[:2]

    tone_prompt = " and ".join(f"{entry['tone']} (weight: {entry['weight']:.2f})" for entry in top_two)

    prompt = f"Rewrite the following text to match these tones: {tone_prompt}. Text: {text}"

    generated = generator(prompt, max_length=100, num_return_sequences=1)
    return generated[0]['generated_text']
24
+
25
+
26
# Gradio Interface
# Module-level UI wiring: two 0..1 sliders feed adjust_tone(); its rewritten
# text lands in the output textbox. Note demo.launch() runs at import time,
# so importing this module starts the web app.
with gr.Blocks() as demo:
    gr.Markdown("# Tone Adjuster")

    input_text = gr.Textbox(label="Input Text")

    with gr.Row():
        # Each slider's value is a tone weight; its complement (1 - value)
        # drives the opposing tone inside adjust_tone().
        concise_slider = gr.Slider(minimum=0, maximum=1, value=0.5, label="Concise vs Expanded")
        casual_slider = gr.Slider(minimum=0, maximum=1, value=0.5, label="Casual vs Professional")

    output_text = gr.Textbox(label="Adjusted Text")

    adjust_btn = gr.Button("Adjust Tone")

    adjust_btn.click(
        adjust_tone,
        inputs=[input_text, concise_slider, casual_slider],
        outputs=output_text
    )

demo.launch()
App_Function_Libraries/Utils.py ADDED
@@ -0,0 +1,440 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Utils.py
2
+ #########################################
3
+ # General Utilities Library
4
+ # This library is used to hold random utilities used by various other libraries.
5
+ #
6
+ ####
7
+ ####################
8
+ # Function List
9
+ #
10
+ # 1. extract_text_from_segments(segments: List[Dict]) -> str
11
+ # 2. download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5)
12
+ # 3. verify_checksum(file_path, expected_checksum)
13
+ # 4. create_download_directory(title)
14
+ # 5. sanitize_filename(filename)
15
+ # 6. normalize_title(title)
16
+ # 7.
17
+ #
18
+ #
19
+ #
20
+ ####################
21
+ # Import necessary libraries
22
+ import configparser
23
+ import hashlib
24
+ import json
25
+ import logging
26
+ from datetime import timedelta
27
+ from urllib.parse import urlparse, parse_qs, urlencode, urlunparse
28
+
29
+ import requests
30
+ import time
31
+ from tqdm import tqdm
32
+ import os
33
+ import re
34
+ import unicodedata
35
+
36
+ from App_Function_Libraries.Video_DL_Ingestion_Lib import get_youtube
37
+
38
+
39
+ #######################################################################################################################
40
+ # Function Definitions
41
+ #
42
+
43
def extract_text_from_segments(segments):
    """Pull the transcription text out of an arbitrarily nested segments payload.

    Walks dicts and lists looking for 'Text' keys; the texts of list items are
    joined with single spaces. Returns an error string when nothing is found.
    """
    logging.debug(f"Segments received: {segments}")
    logging.debug(f"Type of segments: {type(segments)}")

    def walk(node):
        # Depth-first search: first 'Text' value wins inside a dict; a list
        # contributes the space-joined texts of all of its elements.
        if isinstance(node, dict):
            for key, value in node.items():
                if key == 'Text':
                    return value
                if isinstance(value, (dict, list)):
                    found = walk(value)
                    if found:
                        return found
        elif isinstance(node, list):
            return ' '.join(filter(None, (walk(item) for item in node)))
        return None

    text = walk(segments)

    if not text:
        logging.error(f"Unable to extract text from segments: {segments}")
        return "Error: Unable to extract transcription"
    return text.strip()
67
+
68
+
69
def download_file(url, dest_path, expected_checksum=None, max_retries=3, delay=5):
    """Download *url* to *dest_path* with resume, retry, and checksum support.

    A partial download is kept in ``dest_path + '.tmp'`` and resumed via an
    HTTP Range request on the next attempt. On success the temp file is
    renamed into place and the path returned.

    Args:
        url: Source URL.
        dest_path: Final destination path.
        expected_checksum: Optional SHA-256 hex digest to verify against.
        max_retries: Attempts before giving up (re-raises the last error).
        delay: Seconds to sleep between attempts.

    Returns:
        str: ``dest_path`` on success.

    Raises:
        ValueError: when the checksum does not match.
        requests.RequestException: when all retries fail.
    """
    temp_path = dest_path + '.tmp'

    for attempt in range(max_retries):
        try:
            # Ask the server to continue from where a previous attempt stopped.
            resume_header = {}
            if os.path.exists(temp_path):
                resume_header = {'Range': f'bytes={os.path.getsize(temp_path)}-'}

            response = requests.get(url, stream=True, headers=resume_header, timeout=60)
            response.raise_for_status()

            # Bug fix: the original checked `'Range' in response.headers`, but
            # Range is a *request* header -- so resumed downloads silently
            # restarted in 'wb' mode, and a server that honored the Range
            # request (206) would have had its partial body written from
            # offset 0, corrupting the file. A 206 Partial Content reply
            # (carrying Content-Range) is the correct signal to append.
            resumed = response.status_code == 206 or 'Content-Range' in response.headers
            mode = 'ab' if resumed else 'wb'
            initial_pos = os.path.getsize(temp_path) if resumed and os.path.exists(temp_path) else 0

            total_size = int(response.headers.get('content-length', 0))
            if resumed:
                # content-length covers only the remaining bytes; add what we
                # already have so the progress bar total is the full size.
                total_size += initial_pos

            with open(temp_path, mode) as temp_file, tqdm(
                total=total_size, unit='B', unit_scale=True, desc=dest_path, initial=initial_pos, ascii=True
            ) as pbar:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:  # filter out keep-alive new chunks
                        temp_file.write(chunk)
                        pbar.update(len(chunk))

            # Verify the checksum if provided
            if expected_checksum:
                if not verify_checksum(temp_path, expected_checksum):
                    os.remove(temp_path)
                    raise ValueError("Downloaded file's checksum does not match the expected checksum")

            # Move the file to the final destination
            os.rename(temp_path, dest_path)
            print("Download complete and verified!")
            return dest_path

        except Exception as e:
            print(f"Attempt {attempt + 1} failed: {e}")
            if attempt < max_retries - 1:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                print("Max retries reached. Download failed.")
                raise
114
+
115
+
116
def verify_checksum(file_path, expected_checksum):
    """Return True when the SHA-256 hex digest of *file_path* equals *expected_checksum*."""
    digest = hashlib.sha256()
    with open(file_path, 'rb') as fh:
        # Read in fixed-size blocks so large files never sit fully in memory.
        while True:
            block = fh.read(4096)
            if not block:
                break
            digest.update(block)
    return digest.hexdigest() == expected_checksum
122
+
123
+
124
def create_download_directory(title):
    """Create (if needed) and return ``Results/<normalized title>`` for downloads."""
    base_dir = "Results"
    # Strip characters that are illegal in Windows paths before building the dir.
    safe_title = normalize_title(title)
    logging.debug(f"{title} successfully normalized")
    session_path = os.path.join(base_dir, safe_title)
    if os.path.exists(session_path):
        logging.debug(f"Directory already exists for downloaded video: {session_path}")
    else:
        os.makedirs(session_path, exist_ok=True)
        logging.debug(f"Created directory for downloaded video: {session_path}")
    return session_path
136
+
137
+
138
def sanitize_filename(filename):
    """Strip Windows-illegal filename characters and collapse whitespace runs."""
    without_illegal = re.sub(r'[<>:"/\\|?*]', '', filename)
    # Any run of whitespace becomes a single space; ends are trimmed.
    return re.sub(r'\s+', ' ', without_illegal).strip()
143
+
144
+
145
def normalize_title(title):
    """ASCII-fold *title* and replace/strip characters illegal in file names."""
    # Decompose accents (NFKD) and drop anything that will not encode to ASCII.
    ascii_title = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
    # Path separators and ':' become underscores; other reserved chars vanish.
    replacements = {'/': '_', '\\': '_', ':': '_', '"': '', '*': '', '?': '', '<': '', '>': '', '|': ''}
    for char, sub in replacements.items():
        ascii_title = ascii_title.replace(char, sub)
    return ascii_title
152
+
153
+
154
+
155
+
156
def clean_youtube_url(url):
    """Return *url* with any playlist ('list') query parameter removed."""
    parts = urlparse(url)
    params = parse_qs(parts.query)
    # Dropping 'list' forces single-video handling even for playlist links.
    params.pop('list', None)
    return urlunparse(parts._replace(query=urlencode(params, doseq=True)))
164
+
165
+
166
def extract_video_info(url):
    """Fetch yt-dlp metadata for *url*; return ``(info_dict, title)``."""
    info_dict = get_youtube(url)
    # 'title' can be absent for some extractors; fall back to a placeholder.
    return info_dict, info_dict.get('title', 'Untitled')
170
+
171
+
172
# NOTE(review): duplicate definition -- an identical clean_youtube_url is
# defined earlier in this module; this later copy silently shadows it.
# Consider deleting one of the two.
def clean_youtube_url(url):
    """Return *url* with any playlist ('list') query parameter removed."""
    parsed_url = urlparse(url)
    query_params = parse_qs(parsed_url.query)
    if 'list' in query_params:
        query_params.pop('list')
    cleaned_query = urlencode(query_params, doseq=True)
    cleaned_url = urlunparse(parsed_url._replace(query=cleaned_query))
    return cleaned_url
180
+
181
# NOTE(review): duplicate definition -- an identical extract_video_info is
# defined earlier in this module; this later copy silently shadows it.
def extract_video_info(url):
    """Fetch yt-dlp metadata for *url*; return ``(info_dict, title)``."""
    info_dict = get_youtube(url)
    title = info_dict.get('title', 'Untitled')
    return info_dict, title
185
+
186
def import_data(file):
    """Import data from *file* into the application.

    Stub: not yet implemented -- currently accepts any input and does nothing.
    """
    # TODO: implement file-based import.
    pass
189
+
190
+
191
+
192
+
193
+ #######################
194
+ # Config loading
195
+ #
196
+
197
def load_comprehensive_config():
    """Load ``config.txt`` from the project root (one level above this file).

    Returns:
        configparser.ConfigParser: the parsed configuration.

    Raises:
        FileNotFoundError: when no config file exists at the expected path.
    """
    # The config lives next to the top-level scripts, one directory above
    # this library's folder.
    project_root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    config_path = os.path.join(project_root, 'config.txt')
    config = configparser.ConfigParser()
    # ConfigParser.read returns the list of files it parsed; empty == missing.
    if not config.read(config_path):
        raise FileNotFoundError(f"Config file not found at {config_path}")
    return config
211
+
212
+
213
def load_and_log_configs():
    """Load config.txt and return API keys, model names, local endpoints, and paths.

    Returns a dict with keys 'api_keys', 'models', 'local_apis',
    'output_path', and 'processing_choice', or None on any failure (the error
    is logged). API keys are logged only as short previews, never in full.
    """
    try:
        config = load_comprehensive_config()
        if config is None:
            logging.error("Config is None, cannot proceed")
            return None

        def _preview(secret):
            # Bug fix: the original formatted secret[:5] unconditionally, so a
            # missing API key (fallback=None) raised TypeError inside this try
            # block and nulled the *entire* configuration.
            if not secret:
                return "None"
            return f"{secret[:5]}...{secret[-5:]}"

        # API keys: one '<service>_api_key' option per provider.
        services = ['anthropic', 'cohere', 'groq', 'openai', 'huggingface', 'openrouter', 'deepseek']
        api_keys = {}
        for service in services:
            api_keys[service] = config.get('API', f'{service}_api_key', fallback=None)
            logging.debug(f"Loaded {service} API Key: {_preview(api_keys[service])}")

        # Model names with per-provider defaults (previously deepseek's model
        # was loaded but never logged; the loop now logs all of them).
        model_defaults = {
            'anthropic': 'claude-3-sonnet-20240229',
            'cohere': 'command-r-plus',
            'groq': 'llama3-70b-8192',
            'openai': 'gpt-4-turbo',
            'huggingface': 'CohereForAI/c4ai-command-r-plus',
            'openrouter': 'microsoft/wizardlm-2-8x22b',
            'deepseek': 'deepseek-chat',
        }
        models = {}
        for service, default in model_defaults.items():
            models[service] = config.get('API', f'{service}_model', fallback=default)
            logging.debug(f"Loaded {service} Model: {models[service]}")

        # Local inference endpoints: (default URL, default key) per backend.
        # Option names follow the '<service>_api_IP' / '<service>_api_key'
        # convention used by the existing config files.
        local_defaults = {
            'kobold': ('http://127.0.0.1:5000/api/v1/generate', ''),
            'llama': ('http://127.0.0.1:8080/v1/chat/completions', ''),
            'ooba': ('http://127.0.0.1:5000/v1/chat/completions', ''),
            'tabby': ('http://127.0.0.1:5000/api/v1/generate', None),
            'vllm': ('http://127.0.0.1:500/api/v1/chat/completions', None),
        }
        local_apis = {}
        for service, (ip_default, key_default) in local_defaults.items():
            local_apis[service] = {
                'ip': config.get('Local-API', f'{service}_api_IP', fallback=ip_default),
                'key': config.get('Local-API', f'{service}_api_key', fallback=key_default),
            }
            logging.debug(f"Loaded {service} API IP: {local_apis[service]['ip']}")

        # Retrieve output paths from the configuration file
        output_path = config.get('Paths', 'output_path', fallback='results')
        logging.debug(f"Output path set to: {output_path}")

        # Retrieve processing choice from the configuration file
        processing_choice = config.get('Processing', 'processing_choice', fallback='cpu')
        logging.debug(f"Processing choice set to: {processing_choice}")

        # Prompts - FIXME: loaded but not yet returned to callers.
        prompt_path = config.get('Prompts', 'prompt_path', fallback='prompts.db')

        return {
            'api_keys': api_keys,
            'models': models,
            'local_apis': local_apis,
            'output_path': output_path,
            'processing_choice': processing_choice
        }

    except Exception as e:
        logging.error(f"Error loading config: {str(e)}")
        return None
329
+
330
+
331
+
332
+ # Log file
333
+ # logging.basicConfig(filename='debug-runtime.log', encoding='utf-8', level=logging.DEBUG)
334
+
335
+
336
+
337
+
338
+
339
+
340
+
341
def format_metadata_as_text(metadata):
    """Render a video-metadata dict as human-readable "Key: value" lines.

    None values are skipped; lists are comma-joined; 8-digit upload dates
    become YYYY-MM-DD; view/like counts get thousands separators; durations
    (in seconds) become HH:MM:SS. Returns a fallback string for empty input.
    """
    if not metadata:
        return "No metadata available"

    lines = ["Video Metadata:"]
    for key, value in metadata.items():
        if value is None:
            continue
        if isinstance(value, list):
            rendered = ", ".join(str(item) for item in value)
        elif key == 'upload_date' and len(str(value)) == 8:
            # YYYYMMDD -> YYYY-MM-DD
            rendered = f"{value[:4]}-{value[4:6]}-{value[6:]}"
        elif key in ('view_count', 'like_count'):
            rendered = f"{value:,}"
        elif key == 'duration':
            # Seconds -> zero-padded HH:MM:SS
            hours, remainder = divmod(value, 3600)
            minutes, seconds = divmod(remainder, 60)
            rendered = f"{hours:02d}:{minutes:02d}:{seconds:02d}"
        else:
            rendered = str(value)
        lines.append(f"{key.capitalize()}: {rendered}")
    return "\n".join(lines)
367
+
368
+ # # Example usage:
369
+ # example_metadata = {
370
+ # 'title': 'Sample Video Title',
371
+ # 'uploader': 'Channel Name',
372
+ # 'upload_date': '20230615',
373
+ # 'view_count': 1000000,
374
+ # 'like_count': 50000,
375
+ # 'duration': 3725, # 1 hour, 2 minutes, 5 seconds
376
+ # 'tags': ['tag1', 'tag2', 'tag3'],
377
+ # 'description': 'This is a sample video description.'
378
+ # }
379
+ #
380
+ # print(format_metadata_as_text(example_metadata))
381
+
382
+
383
+
384
def convert_to_seconds(time_str):
    """Convert "HH:MM:SS", "MM:SS", "SS", or a plain digit string to seconds.

    Falsy input yields 0; unrecognized formats raise ValueError.
    """
    if not time_str:
        return 0

    # A pure digit string is already a second count.
    if time_str.isdigit():
        return int(time_str)

    parts = time_str.split(':')
    if len(parts) == 3:
        delta = timedelta(hours=int(parts[0]), minutes=int(parts[1]), seconds=int(parts[2]))
        return int(delta.total_seconds())
    if len(parts) == 2:
        delta = timedelta(minutes=int(parts[0]), seconds=int(parts[1]))
        return int(delta.total_seconds())
    if len(parts) == 1:
        return int(parts[0])
    raise ValueError(f"Invalid time format: {time_str}")
405
+
406
+
407
def save_to_file(video_urls, filename):
    """Write *video_urls* one per line to *filename* (overwriting it).

    Args:
        video_urls (list[str]): URLs to persist.
        filename (str): Destination path.
    """
    with open(filename, 'w') as file:
        file.write('\n'.join(video_urls))
    # Bug fix: the confirmation message previously printed a literal
    # placeholder instead of the actual destination path.
    print(f"Video URLs saved to {filename}")
411
+
412
+
413
def save_segments_to_json(segments, file_name="transcription_segments.json"):
    """Persist transcription *segments* as pretty-printed UTF-8 JSON.

    The file is written under the ``Results`` directory, which is created on
    demand.

    Parameters:
        segments (list): List of transcription segments
        file_name (str): Name of the JSON file to save

    Returns:
        str: Path to the saved JSON file
    """
    results_dir = "Results"
    os.makedirs(results_dir, exist_ok=True)

    json_file_path = os.path.join(results_dir, file_name)

    # ensure_ascii=False keeps non-Latin transcriptions human-readable.
    with open(json_file_path, 'w', encoding='utf-8') as json_file:
        json.dump(segments, json_file, ensure_ascii=False, indent=4)

    return json_file_path
435
+
436
+
437
+
438
+
439
+
440
+
App_Function_Libraries/Video_DL_Ingestion_Lib.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Video_DL_Ingestion_Lib.py
2
+ #########################################
3
+ # Video Downloader and Ingestion Library
4
+ # This library is used to handle downloading videos from YouTube and other platforms.
5
+ # It also handles the ingestion of the videos into the database.
6
+ # It uses yt-dlp to extract video information and download the videos.
7
+ ####
8
+ import json
9
+ ####################
10
+ # Function List
11
+ #
12
+ # 1. get_video_info(url)
13
+ # 2. create_download_directory(title)
14
+ # 3. sanitize_filename(title)
15
+ # 4. normalize_title(title)
16
+ # 5. get_youtube(video_url)
17
+ # 6. get_playlist_videos(playlist_url)
18
+ # 7. download_video(video_url, download_path, info_dict, download_video_flag)
19
+ # 8. save_to_file(video_urls, filename)
20
+ # 9. save_summary_to_file(summary, file_path)
21
+ # 10. process_url(url, num_speakers, whisper_model, custom_prompt, offset, api_name, api_key, vad_filter, download_video, download_audio, rolling_summarization, detail_level, question_box, keywords, chunk_summarization, chunk_duration_input, words_per_second_input)
22
+ #
23
+ #
24
+ ####################
25
+ # Import necessary libraries to run solo for testing
26
+ import logging
27
+ import os
28
+ import re
29
+ import sys
30
+ from urllib.parse import urlparse, parse_qs
31
+
32
+ import unicodedata
33
+ # 3rd-Party Imports
34
+ import yt_dlp
35
+ # Import Local
36
+ #
37
+ #######################################################################################################################
38
+ # Function Definitions
39
+ #
40
+
41
def normalize_title(title):
    """ASCII-fold *title* and strip/replace filename-hostile characters."""
    ascii_only = unicodedata.normalize('NFKD', title).encode('ascii', 'ignore').decode('ascii')
    # '/', '\' and ':' map to '_'; the remaining reserved characters are removed.
    table = str.maketrans({'/': '_', '\\': '_', ':': '_',
                           '"': None, '*': None, '?': None,
                           '<': None, '>': None, '|': None})
    return ascii_only.translate(table)
48
+
49
def get_video_info(url: str) -> dict:
    """Fetch yt-dlp metadata for *url* without downloading any media.

    Returns the extractor's info dict, or None when extraction fails
    (the error is logged rather than raised).
    """
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'skip_download': True,  # metadata only -- never fetch the media file
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info_dict = ydl.extract_info(url, download=False)
            return info_dict
        except Exception as e:
            logging.error(f"Error extracting video info: {e}")
            return None
62
+
63
+
64
def get_youtube(video_url):
    """Extract yt-dlp metadata for *video_url* (audio-format preference, no download).

    Unlike get_video_info, extraction errors are NOT caught here -- failures
    propagate to the caller.
    """
    ydl_opts = {
        'format': 'bestaudio[ext=m4a]',
        'noplaylist': False,  # playlists are allowed; extract_flat keeps them shallow
        'quiet': True,
        'extract_flat': True
    }
    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        logging.debug("About to extract youtube info")
        info_dict = ydl.extract_info(video_url, download=False)
        logging.debug("Youtube info successfully extracted")
        return info_dict
76
+
77
+
78
def get_playlist_videos(playlist_url):
    """Return ``([video_url, ...], playlist_title)`` for *playlist_url*.

    Uses flat extraction, so entries are not resolved individually and
    nothing is downloaded. Returns ``([], None)`` when the extractor yields
    no 'entries'.
    """
    ydl_opts = {
        'extract_flat': True,
        'skip_download': True,
        'quiet': True
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(playlist_url, download=False)

        if 'entries' in info:
            # NOTE(review): assumes every flat entry carries 'url' and the
            # playlist info carries 'title'; a missing key would raise
            # KeyError here -- confirm against yt-dlp's flat-extraction
            # output for the sites this is used with.
            video_urls = [entry['url'] for entry in info['entries']]
            playlist_title = info['title']
            return video_urls, playlist_title
        else:
            print("No videos found in the playlist.")
            return [], None
95
+
96
+
97
def download_video(video_url, download_path, info_dict, download_video_flag):
    """Download the media for *video_url* into *download_path* via yt-dlp.

    Args:
        video_url: URL to hand to yt-dlp.
        download_path: Directory the media file is written into.
        info_dict: Pre-fetched yt-dlp info dict; must contain 'title' and 'ext'.
        download_video_flag: True downloads muxed mp4 video+audio; False
            downloads best m4a audio only (still written to a .mp4 path,
            matching historical behavior).

    Returns:
        Path of the downloaded (or pre-existing) file, or None on failure.
    """
    # Module-level globals kept for callers that read these after the call.
    global video_file_path, ffmpeg_path

    # Normalize the title so it is safe to use as a filename.
    logging.debug("About to normalize downloaded video title")
    if 'title' not in info_dict or 'ext' not in info_dict:
        logging.error("info_dict is missing 'title' or 'ext'")
        return None

    normalized_video_title = normalize_title(info_dict['title'])
    video_file_path = os.path.join(download_path, f"{normalized_video_title}.{info_dict['ext']}")

    # Re-use an already-downloaded file instead of fetching it again.
    if os.path.exists(video_file_path):
        logging.info(f"Video file already exists: {video_file_path}")
        return video_file_path

    # Windows uses the bundled ffmpeg under ./Bin; every other platform is
    # expected to have ffmpeg on PATH.  (The original left ffmpeg_path unset
    # on platforms other than win/linux/darwin, causing a NameError below.)
    if sys.platform.startswith('win'):
        ffmpeg_path = os.path.join(os.getcwd(), 'Bin', 'ffmpeg.exe')
    else:
        ffmpeg_path = 'ffmpeg'

    # Both modes write an .mp4 path; only the yt-dlp format selection differs.
    # (The original duplicated the whole download sequence in each branch and
    # ended with an unreachable `else` after `if flag` / `elif not flag`.)
    video_file_path = os.path.join(download_path, f"{normalized_video_title}.mp4")
    if download_video_flag:
        ydl_opts = {
            'format': 'bestvideo[ext=mp4]+bestaudio[ext=m4a]',
            'outtmpl': video_file_path,
            'ffmpeg_location': ffmpeg_path
        }
    else:
        ydl_opts = {
            'format': 'bestaudio[ext=m4a]',
            'quiet': True,
            'outtmpl': video_file_path
        }

    try:
        with yt_dlp.YoutubeDL(ydl_opts) as ydl:
            logging.debug("yt_dlp: About to download video with youtube-dl")
            ydl.download([video_url])
            logging.debug("yt_dlp: Video successfully downloaded with youtube-dl")
        if os.path.exists(video_file_path):
            return video_file_path
        logging.error("yt_dlp: Video file not found after download")
        return None
    except Exception as e:
        logging.error(f"yt_dlp: Error downloading video: {e}")
        return None
170
+
171
+
172
def extract_video_info(url):
    """Fetch the full yt-dlp info dict for *url*; return None on failure."""
    try:
        with yt_dlp.YoutubeDL({'quiet': True}) as ydl:
            info = ydl.extract_info(url, download=False)

            # Log only a subset of the info to avoid overwhelming the logs
            log_info = {key: info.get(key)
                        for key in ('title', 'duration', 'upload_date')}
            logging.debug(f"Extracted info for {url}: {log_info}")

            return info
    except Exception as e:
        logging.error(f"Error extracting video info for {url}: {str(e)}", exc_info=True)
        return None
189
+
190
+
191
def get_youtube_playlist_urls(playlist_id):
    """Return the video URLs contained in the YouTube playlist *playlist_id*.

    Args:
        playlist_id: Value of the playlist URL's 'list' query parameter.

    Returns:
        List of entry URLs; empty when the playlist has no resolvable entries.
    """
    ydl_opts = {
        'extract_flat': True,
        'quiet': True,
    }

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        result = ydl.extract_info(f'https://www.youtube.com/playlist?list={playlist_id}', download=False)
        # 'entries' is absent for empty/unavailable playlists and individual
        # entries can be None — guard both instead of raising KeyError.
        return [entry['url'] for entry in (result.get('entries') or [])
                if entry and entry.get('url')]
200
+
201
+
202
def parse_and_expand_urls(url_input):
    """Split newline-separated URL input and expand known platform URLs.

    YouTube playlist links expand to their member video URLs, youtu.be
    short links become canonical watch URLs, Vimeo links are normalized,
    and anything else passes through unchanged.
    """
    logging.info(f"Starting parse_and_expand_urls with input: {url_input}")
    raw_urls = [line.strip() for line in url_input.split('\n') if line.strip()]
    logging.info(f"Parsed URLs: {raw_urls}")
    results = []

    for url in raw_urls:
        try:
            logging.info(f"Processing URL: {url}")
            parts = urlparse(url)
            logging.debug(f"Parsed URL components: {parts}")

            if 'youtube.com' in parts.netloc and 'list' in parts.query:
                # YouTube playlist: expand into its member video URLs.
                playlist_id = parse_qs(parts.query)['list'][0]
                logging.info(f"Detected YouTube playlist with ID: {playlist_id}")
                member_urls = get_youtube_playlist_urls(playlist_id)
                logging.info(f"Expanded playlist URLs: {member_urls}")
                results.extend(member_urls)
            elif 'youtu.be' in parts.netloc:
                # YouTube short link: rewrite to the canonical watch URL.
                video_id = parts.path.lstrip('/')
                full_url = f'https://www.youtube.com/watch?v={video_id}'
                logging.info(f"Expanded YouTube short URL to: {full_url}")
                results.append(full_url)
            elif 'vimeo.com' in parts.netloc:
                # Vimeo: normalize to the canonical video URL.
                video_id = parts.path.lstrip('/')
                full_url = f'https://vimeo.com/{video_id}'
                logging.info(f"Processed Vimeo URL: {full_url}")
                results.append(full_url)
            else:
                logging.info(f"URL not recognized as special case, adding as-is: {url}")
                results.append(url)
        except Exception as e:
            logging.error(f"Error processing URL {url}: {str(e)}", exc_info=True)

    logging.info(f"Final expanded URLs: {results}")
    return results
249
+
250
+
251
def extract_metadata(url, use_cookies=False, cookies=None):
    """Extract a fixed set of metadata fields for *url* via yt-dlp.

    Args:
        url: Media URL to inspect.
        use_cookies: When True, apply *cookies* to the request.
        cookies: JSON object string mapping cookie names to values.

    Returns:
        Dict with title/uploader/upload_date/view_count/like_count/duration/
        tags/description, or None if extraction fails.
    """
    ydl_opts = {
        'quiet': True,
        'no_warnings': True,
        'extract_flat': True,
        'skip_download': True,
    }

    if use_cookies and cookies:
        try:
            cookie_dict = json.loads(cookies)
            # BUG FIX: 'cookiefile' expects a *path* to a Netscape-format
            # cookie file, not a dict, so the original assignment silently
            # never applied the cookies.  Send them as a Cookie header instead.
            if isinstance(cookie_dict, dict):
                header = '; '.join(f'{name}={value}' for name, value in cookie_dict.items())
                ydl_opts['http_headers'] = {'Cookie': header}
            else:
                logging.warning("Invalid cookie format. Proceeding without cookies.")
        except json.JSONDecodeError:
            logging.warning("Invalid cookie format. Proceeding without cookies.")

    with yt_dlp.YoutubeDL(ydl_opts) as ydl:
        try:
            info = ydl.extract_info(url, download=False)
            metadata = {
                'title': info.get('title'),
                'uploader': info.get('uploader'),
                'upload_date': info.get('upload_date'),
                'view_count': info.get('view_count'),
                'like_count': info.get('like_count'),
                'duration': info.get('duration'),
                'tags': info.get('tags'),
                'description': info.get('description')
            }

            # Create a safe subset of metadata to log
            safe_metadata = {
                'title': metadata.get('title', 'No title'),
                'duration': metadata.get('duration', 'Unknown duration'),
                'upload_date': metadata.get('upload_date', 'Unknown upload date'),
                'uploader': metadata.get('uploader', 'Unknown uploader')
            }

            logging.info(f"Successfully extracted metadata for {url}: {safe_metadata}")
            return metadata
        except Exception as e:
            logging.error(f"Error extracting metadata for {url}: {str(e)}", exc_info=True)
            return None
293
+
294
+
295
def generate_timestamped_url(url, hours, minutes, seconds):
    """Build a YouTube URL that starts playback at the given time offset.

    Args:
        url: Any YouTube URL containing an 11-character video ID.
        hours: Hour component; int or numeric string ('' / None count as 0).
        minutes: Minute component; same conventions as *hours*.
        seconds: Second component; same conventions as *hours*.

    Returns:
        The timestamped watch URL, or the string "Invalid YouTube URL"
        when no video ID can be found in *url*.
    """
    # Extract the 11-character video ID from the URL.
    video_id_match = re.search(r'(?:v=|\/)([0-9A-Za-z_-]{11}).*', url)
    if not video_id_match:
        return "Invalid YouTube URL"

    video_id = video_id_match.group(1)

    # Treat empty/None components as zero so blank UI form fields don't
    # raise ValueError in int() as they previously did.
    total_seconds = int(hours or 0) * 3600 + int(minutes or 0) * 60 + int(seconds or 0)

    # Generate the new URL with the &t= offset in whole seconds.
    return f"https://www.youtube.com/watch?v={video_id}&t={total_seconds}s"
310
+
311
+
312
+
313
+ #
314
+ #
315
+ #######################################################################################################################
App_Function_Libraries/__Init__.py ADDED
File without changes
App_Function_Libraries/__pycache__/Article_Extractor_Lib.cpython-312.pyc ADDED
Binary file (4.89 kB). View file
 
App_Function_Libraries/__pycache__/Article_Summarization_Lib.cpython-312.pyc ADDED
Binary file (12.1 kB). View file
 
App_Function_Libraries/__pycache__/Audio_Files.cpython-312.pyc ADDED
Binary file (23.2 kB). View file
 
App_Function_Libraries/__pycache__/Audio_Transcription_Lib.cpython-312.pyc ADDED
Binary file (7.45 kB). View file
 
App_Function_Libraries/__pycache__/Book_Ingestion_Lib.cpython-312.pyc ADDED
Binary file (2.99 kB). View file
 
App_Function_Libraries/__pycache__/Chunk_Lib.cpython-312.pyc ADDED
Binary file (19.3 kB). View file
 
App_Function_Libraries/__pycache__/Diarization_Lib.cpython-312.pyc ADDED
Binary file (8.47 kB). View file
 
App_Function_Libraries/__pycache__/Gradio_Related.cpython-312.pyc ADDED
Binary file (109 kB). View file
 
App_Function_Libraries/__pycache__/LLM_API_Calls.cpython-312.pyc ADDED
Binary file (28.4 kB). View file
 
App_Function_Libraries/__pycache__/Local_File_Processing_Lib.cpython-312.pyc ADDED
Binary file (3.66 kB). View file
 
App_Function_Libraries/__pycache__/Local_LLM_Inference_Engine_Lib.cpython-312.pyc ADDED
Binary file (22 kB). View file
 
App_Function_Libraries/__pycache__/Local_Summarization_Lib.cpython-312.pyc ADDED
Binary file (21.2 kB). View file
 
App_Function_Libraries/__pycache__/Old_Chunking_Lib.cpython-312.pyc ADDED
Binary file (3.3 kB). View file
 
App_Function_Libraries/__pycache__/PDF_Ingestion_Lib.cpython-312.pyc ADDED
Binary file (6.61 kB). View file
 
App_Function_Libraries/__pycache__/SQLite_DB.cpython-312.pyc ADDED
Binary file (43.8 kB). View file
 
App_Function_Libraries/__pycache__/Summarization_General_Lib.cpython-312.pyc ADDED
Binary file (61 kB). View file
 
App_Function_Libraries/__pycache__/System_Checks_Lib.cpython-312.pyc ADDED
Binary file (8.34 kB). View file
 
App_Function_Libraries/__pycache__/Tokenization_Methods_Lib.cpython-312.pyc ADDED
Binary file (565 Bytes). View file
 
App_Function_Libraries/__pycache__/Utils.cpython-312.pyc ADDED
Binary file (18.8 kB). View file
 
App_Function_Libraries/__pycache__/Video_DL_Ingestion_Lib.cpython-312.pyc ADDED
Binary file (13.6 kB). View file
 
App_Function_Libraries/__pycache__/__init__.cpython-312.pyc ADDED
Binary file (164 Bytes). View file
 
App_Function_Libraries/models/config.yaml ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 3.1.0
2
+
3
+ pipeline:
4
+ name: pyannote.audio.pipelines.SpeakerDiarization
5
+ params:
6
+ clustering: AgglomerativeClustering
7
+ # embedding: pyannote/wespeaker-voxceleb-resnet34-LM # If you want to use the HF model
8
+ embedding: pyannote_model_wespeaker-voxceleb-resnet34-LM.bin # If you want to use the local model
9
+ embedding_batch_size: 32
10
+ embedding_exclude_overlap: true
11
+ # segmentation: pyannote/segmentation-3.0 # If you want to use the HF model
12
+ segmentation: pyannote_model_segmentation-3.0.bin # If you want to use the local model
13
+ segmentation_batch_size: 32
14
+
15
+ params:
16
+ clustering:
17
+ method: centroid
18
+ min_cluster_size: 12
19
+ threshold: 0.7045654963945799
20
+ segmentation:
21
+ min_duration_off: 0.0
app.py CHANGED
The diff for this file is too large to render. See raw diff